Merge PR #19440 into master

* refs/pull/19440/head:
	mds: update mds option descriptions
	mds: obsolete MDSMap option configs
	mds: organize Filesystem class def

Reviewed-by: John Spray <john.spray@redhat.com>
This commit is contained in:
Patrick Donnelly 2017-12-15 11:07:58 -08:00
commit ee9d5f9a45
No known key found for this signature in database
GPG Key ID: 3A2A7E25BEA8AADB
21 changed files with 197 additions and 209 deletions

View File

@ -20,6 +20,11 @@
- mds remove_data_pool -> fs rm_data_pool
- mds rm_data_pool -> fs rm_data_pool
* New CephFS file system attributes session_timeout and session_autoclose
are configurable via `ceph fs set`. The MDS config options
mds_session_timeout, mds_session_autoclose, and mds_max_file_size are now
obsolete.
>= 12.2.2

View File

@ -25,10 +25,9 @@ fragments may be *merged* to reduce the number of fragments in the directory.
Splitting and merging
=====================
An MDS will only consider doing splits and merges if the ``mds_bal_frag``
setting is true in the MDS's configuration file, and the allow_dirfrags
setting is true in the filesystem map (set on the mons). These settings
are both true by default since the *Luminous* (12.2.x) release of Ceph.
An MDS will only consider doing splits if the allow_dirfrags setting is true in
the file system map (set on the mons). This setting is true by default since
the *Luminous* release (12.2.x).
When an MDS identifies a directory fragment to be split, it does not
do the split immediately. Because splitting interrupts metadata IO,

View File

@ -23,9 +23,9 @@ Automatic client eviction
There are two situations in which a client may be evicted automatically:
On an active MDS daemon, if a client has not communicated with the MDS for
over ``mds_session_autoclose`` seconds (300 seconds by default), then it
will be evicted automatically.
On an active MDS daemon, if a client has not communicated with the MDS for over
``session_autoclose`` (a file system variable) seconds (300 seconds by
default), then it will be evicted automatically.
During MDS startup (including on failover), the MDS passes through a
state called ``reconnect``. During this state, it waits for all the

View File

@ -69,7 +69,7 @@ are like locks. Sometimes, for example when another client needs access,
the MDS will request clients release their capabilities. If the client
is unresponsive or buggy, it might fail to do so promptly or fail to do
so at all. This message appears if a client has taken longer than
``mds_session_timeout`` (default 60s) to comply.
``session_timeout`` (default 60s) to comply.
Message: "Client *name* failing to respond to cache pressure"
Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY

View File

@ -10,15 +10,6 @@
:Type: Boolean
:Default: ``true``
``mds max file size``
:Description: The maximum allowed file size to set when creating a
new file system.
:Type: 64-bit Integer Unsigned
:Default: ``1ULL << 40``
``mds cache memory limit``
:Description: The memory limit the MDS should enforce for its cache.
@ -107,24 +98,6 @@
:Default: ``24.0*60.0``
``mds session timeout``
:Description: The interval (in seconds) of client inactivity before Ceph
times out capabilities and leases.
:Type: Float
:Default: ``60``
``mds session autoclose``
:Description: The interval (in seconds) before Ceph closes
a laggy client's session.
:Type: Float
:Default: ``300``
``mds reconnect timeout``
:Description: The interval (in seconds) to wait for clients to reconnect
@ -254,13 +227,6 @@
:Default: ``0``
``mds bal frag``
:Description: Determines whether the MDS will fragment directories.
:Type: Boolean
:Default: ``false``
``mds bal split size``
:Description: The maximum directory size before the MDS will split a directory

View File

@ -437,14 +437,18 @@ class Filesystem(MDSCluster):
raise RuntimeError("cannot deactivate rank 0")
self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
def set_var(self, var, *args):
    """Set a file system attribute via `ceph fs set <fs_name> <var> <args...>`.

    :param var: name of the file system variable (e.g. "max_mds",
                "session_timeout", "allow_dirfrags")
    :param args: value(s) for the variable; each is converted to str
                 before being passed to the mon command, so callers may
                 pass ints/bools directly
    """
    # Stringify every argument so the raw cluster command receives strings.
    a = map(str, args)
    self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
def set_max_mds(self, max_mds):
self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds)
self.set_var("max_mds", "%d" % max_mds)
def set_allow_dirfrags(self, yes):
self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it')
self.set_var("allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it')
def set_allow_new_snaps(self, yes):
self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_new_snaps", str(yes).lower(), '--yes-i-really-mean-it')
self.set_var("allow_new_snaps", str(yes).lower(), '--yes-i-really-mean-it')
def get_pgs_per_fs_pool(self):
"""
@ -559,6 +563,9 @@ class Filesystem(MDSCluster):
def get_mds_map(self):
return self.status().get_fsmap(self.id)['mdsmap']
def get_var(self, var):
    """Read a variable from this file system's MDS map.

    Fetches current cluster status and indexes into this filesystem's
    'mdsmap' section; raises KeyError if `var` is not present there.

    :param var: mdsmap key, e.g. "session_timeout" or "session_autoclose"
    :return: the value as stored in the mdsmap (type depends on the key)
    """
    return self.status().get_fsmap(self.id)['mdsmap'][var]
def add_data_pool(self, name):
self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__())
self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)

View File

@ -134,10 +134,10 @@ class TestClientLimits(CephFSTestCase):
# Client B tries to stat the file that client A created
rproc = self.mount_b.write_background("file1")
# After mds_session_timeout, we should see a health warning (extra lag from
# After session_timeout, we should see a health warning (extra lag from
# MDS beacon period)
mds_session_timeout = float(self.fs.get_config("mds_session_timeout"))
self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_session_timeout + 10)
session_timeout = self.fs.get_var("session_timeout")
self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10)
# Client B should still be stuck
self.assertFalse(rproc.finished)

View File

@ -28,10 +28,9 @@ class TestClientNetworkRecovery(CephFSTestCase):
REQUIRE_ONE_CLIENT_REMOTE = True
CLIENTS_REQUIRED = 2
LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
# Environment references
mds_session_timeout = None
mds_reconnect_timeout = None
ms_max_backoff = None
@ -43,6 +42,8 @@ class TestClientNetworkRecovery(CephFSTestCase):
I/O after failure.
"""
session_timeout = self.fs.get_var("session_timeout")
# We only need one client
self.mount_b.umount_wait()
@ -65,7 +66,7 @@ class TestClientNetworkRecovery(CephFSTestCase):
# ...then it should block
self.assertFalse(write_blocked.finished)
self.assert_session_state(client_id, "open")
time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale
time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale
self.assertFalse(write_blocked.finished)
self.assert_session_state(client_id, "stale")
@ -85,10 +86,9 @@ class TestClientRecovery(CephFSTestCase):
REQUIRE_KCLIENT_REMOTE = True
CLIENTS_REQUIRED = 2
LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
# Environment references
mds_session_timeout = None
mds_reconnect_timeout = None
ms_max_backoff = None
@ -212,6 +212,8 @@ class TestClientRecovery(CephFSTestCase):
self.mount_a.create_destroy()
def test_stale_caps(self):
session_timeout = self.fs.get_var("session_timeout")
# Capability release from stale session
# =====================================
cap_holder = self.mount_a.open_background()
@ -224,7 +226,7 @@ class TestClientRecovery(CephFSTestCase):
self.mount_a.kill()
try:
# Now, after mds_session_timeout seconds, the waiter should
# Now, after session_timeout seconds, the waiter should
# complete their operation when the MDS marks the holder's
# session stale.
cap_waiter = self.mount_b.write_background()
@ -237,9 +239,9 @@ class TestClientRecovery(CephFSTestCase):
cap_waited = b - a
log.info("cap_waiter waited {0}s".format(cap_waited))
self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
"Capability handover took {0}, expected approx {1}".format(
cap_waited, self.mds_session_timeout
cap_waited, session_timeout
))
cap_holder.stdin.close()
@ -259,6 +261,8 @@ class TestClientRecovery(CephFSTestCase):
# Eviction while holding a capability
# ===================================
session_timeout = self.fs.get_var("session_timeout")
# Take out a write capability on a file on client A,
# and then immediately kill it.
cap_holder = self.mount_a.open_background()
@ -288,9 +292,9 @@ class TestClientRecovery(CephFSTestCase):
log.info("cap_waiter waited {0}s".format(cap_waited))
# This is the check that it happened 'now' rather than waiting
# for the session timeout
self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
self.assertLess(cap_waited, session_timeout / 2.0,
"Capability handover took {0}, expected less than {1}".format(
cap_waited, self.mds_session_timeout / 2.0
cap_waited, session_timeout / 2.0
))
cap_holder.stdin.close()

View File

@ -33,7 +33,6 @@ class TestFragmentation(CephFSTestCase):
Apply kwargs as MDS configuration settings, enable dirfrags
and restart the MDSs.
"""
kwargs['mds_bal_frag'] = "true"
for k, v in kwargs.items():
self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())

View File

@ -11,9 +11,6 @@ import json
class TestMisc(CephFSTestCase):
CLIENTS_REQUIRED = 2
LOAD_SETTINGS = ["mds_session_autoclose"]
mds_session_autoclose = None
def test_getattr_caps(self):
"""
Check if MDS recognizes the 'mask' parameter of open request.
@ -104,6 +101,8 @@ class TestMisc(CephFSTestCase):
only session
"""
session_autoclose = self.fs.get_var("session_autoclose")
self.mount_b.umount_wait()
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)
@ -111,7 +110,7 @@ class TestMisc(CephFSTestCase):
self.mount_a.kill()
self.mount_a.kill_cleanup()
time.sleep(self.mds_session_autoclose * 1.5)
time.sleep(session_autoclose * 1.5)
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)
@ -126,7 +125,7 @@ class TestMisc(CephFSTestCase):
self.mount_a.kill()
self.mount_a.kill_cleanup()
time.sleep(self.mds_session_autoclose * 1.5)
time.sleep(session_autoclose * 1.5)
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)

View File

@ -962,7 +962,7 @@ class TestStrays(CephFSTestCase):
max_purge_files = 2
self.set_conf('mds', 'mds_bal_frag', 'false')
self.fs.set_allow_dirfrags(True)
self.set_conf('mds', 'mds_max_purge_files', "%d" % max_purge_files)
self.fs.mds_fail_restart()
self.fs.wait_for_daemons()

View File

@ -398,7 +398,6 @@ OPTION(filer_max_purge_ops, OPT_U32)
OPTION(filer_max_truncate_ops, OPT_U32)
OPTION(mds_data, OPT_STR)
OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph fs set <fs_name> max_file_size <size>' afterwards
// max xattr kv pairs size for each dir/file
OPTION(mds_max_xattr_pairs_size, OPT_U32)
OPTION(mds_max_file_recover, OPT_U32)
@ -409,17 +408,15 @@ OPTION(mds_beacon_interval, OPT_FLOAT)
OPTION(mds_beacon_grace, OPT_FLOAT)
OPTION(mds_enforce_unique_name, OPT_BOOL)
OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client unresponsive or not returning its caps
OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist clients whose sessions are dropped due to timeout
OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands
OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation?
OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart
// make it (mds_session_timeout - mds_beacon_grace)
// make it (mdsmap.session_timeout - mds_beacon_grace)
OPTION(mds_tick_interval, OPT_FLOAT)
OPTION(mds_dirstat_min_interval, OPT_FLOAT) // try to avoid propagating more often than this
OPTION(mds_scatter_nudge_interval, OPT_FLOAT) // how quickly dirstat changes propagate up the hierarchy
@ -436,7 +433,6 @@ OPTION(mds_bal_export_pin, OPT_BOOL) // allow clients to pin directory trees to
OPTION(mds_bal_sample_interval, OPT_DOUBLE) // every 3 seconds
OPTION(mds_bal_replicate_threshold, OPT_FLOAT)
OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT)
OPTION(mds_bal_frag, OPT_BOOL)
OPTION(mds_bal_split_size, OPT_INT)
OPTION(mds_bal_split_rd, OPT_FLOAT)
OPTION(mds_bal_split_wr, OPT_FLOAT)

View File

@ -6006,15 +6006,11 @@ std::vector<Option> get_mds_options() {
return std::vector<Option>({
Option("mds_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("/var/lib/ceph/mds/$cluster-$id")
.set_description(""),
Option("mds_max_file_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1ULL << 40)
.set_description(""),
.set_description("path to MDS data and keyring"),
Option("mds_max_xattr_pairs_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(64_K)
.set_description(""),
.set_description("maximum aggregate size of extended attributes on a file"),
Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
@ -6028,7 +6024,7 @@ std::vector<Option> get_mds_options() {
Option("mds_cache_reservation", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.05)
.set_description("amount of memory to reserve"),
.set_description("amount of memory to reserve for future cached objects"),
Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(1.5)
@ -6036,245 +6032,234 @@ std::vector<Option> get_mds_options() {
Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.7)
.set_description(""),
.set_description("midpoint for MDS cache LRU"),
Option("mds_max_file_recover", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(32)
.set_description(""),
.set_description("maximum number of files to recover file sizes in parallel"),
Option("mds_dir_max_commit_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10)
.set_description(""),
.set_description("maximum size in megabytes for a RADOS write to a directory"),
Option("mds_dir_keys_per_op", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(16384)
.set_description(""),
.set_description("number of directory entries to read in one RADOS operation"),
Option("mds_decay_halflife", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description("rate of decay for temperature counters on each directory for balancing"),
Option("mds_beacon_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(4)
.set_description(""),
.set_description("interval in seconds between MDS beacons to monitors"),
Option("mds_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(15)
.set_description(""),
.set_description("tolerance in seconds for missed beacons from monitors"),
Option("mds_enforce_unique_name", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
Option("mds_session_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60)
.set_description(""),
.set_description("require MDS name is unique in the cluster"),
Option("mds_session_blacklist_on_timeout", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("blacklist clients whose sessions have become stale"),
Option("mds_session_blacklist_on_evict", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("blacklist clients that have been evicted"),
Option("mds_sessionmap_keys_per_op", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1024)
.set_description(""),
.set_description("number of omap keys to read from the SessionMap in one operation"),
Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60)
.set_description(""),
.set_description("timeout for clients late on cap recall to create health warnings"),
Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(30)
.set_description(""),
Option("mds_session_autoclose", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(300)
.set_description(""),
Option("mds_health_summarize_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10)
.set_description(""),
.set_description("threshold of number of clients to summarize late client recall"),
Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(45)
.set_description(""),
.set_description("timeout in seconds to wait for clients to reconnect during MDS reconnect recovery state"),
Option("mds_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description("time in seconds between upkeep tasks"),
Option("mds_dirstat_min_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_dirstat_min_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1)
.set_description(""),
Option("mds_scatter_nudge_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description("minimum interval between scatter lock updates"),
Option("mds_client_prealloc_inos", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(1000)
.set_description(""),
.set_description("number of unused inodes to pre-allocate to clients for file creation"),
Option("mds_early_reply", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("additional reply to clients that metadata requests are complete but not yet durable"),
Option("mds_default_dir_hash", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(CEPH_STR_HASH_RJENKINS)
.set_description(""),
.set_description("hash function to select directory fragment for dentry name"),
Option("mds_log_pause", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_log_pause", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("mds_log_skip_corrupt_events", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_log_skip_corrupt_events", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("mds_log_max_events", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(-1)
.set_description(""),
.set_description("maximum number of events in the MDS journal (-1 is unlimited)"),
Option("mds_log_events_per_segment", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(1024)
.set_description(""),
.set_description("maximum number of events in an MDS journal segment"),
Option("mds_log_segment_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
.set_description("size in bytes of each MDS log segment"),
Option("mds_log_max_segments", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(30)
.set_description(""),
.set_description("maximum number of segments which may be untrimmed"),
Option("mds_bal_export_pin", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("allow setting directory export pins to particular ranks"),
Option("mds_bal_sample_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(3.0)
.set_description(""),
.set_description("interval in seconds between balancer ticks"),
Option("mds_bal_replicate_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(8000)
.set_description(""),
.set_description("hot popularity threshold to replicate a subtree"),
Option("mds_bal_unreplicate_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
Option("mds_bal_frag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("cold popularity threshold to merge subtrees"),
Option("mds_bal_split_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10000)
.set_description(""),
.set_description("minimum size of directory fragment before splitting"),
Option("mds_bal_split_rd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(25000)
.set_description(""),
.set_description("hot read popularity threshold for splitting a directory fragment"),
Option("mds_bal_split_wr", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(10000)
.set_description(""),
.set_description("hot write popularity threshold for splitting a directory fragment"),
Option("mds_bal_split_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(3)
.set_description(""),
.set_min_max(1, 24)
.set_description("power of two child fragments for a fragment on split"),
Option("mds_bal_merge_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(50)
.set_description(""),
.set_description("size of fragments where merging should occur"),
Option("mds_bal_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10)
.set_description(""),
.set_description("interval between MDS balancer cycles"),
Option("mds_bal_fragment_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description("delay in seconds before interrupting client IO to perform splits"),
Option("mds_bal_fragment_size_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10000*10)
.set_description(""),
.set_description("maximum size of a directory fragment before new creat/links fail"),
Option("mds_bal_fragment_fast_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(1.5)
.set_description(""),
.set_description("ratio of mds_bal_split_size at which fast fragment splitting occurs"),
Option("mds_bal_idle_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
.set_description("idle metadata popularity threshold before rebalancing"),
Option("mds_bal_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_bal_max", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(-1)
.set_description(""),
Option("mds_bal_max_until", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_bal_max_until", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(-1)
.set_description(""),
Option("mds_bal_mode", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_bal_mode", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_bal_min_rebalance", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_bal_min_rebalance", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.1)
.set_description(""),
.set_description("amount overloaded over internal target before balancer begins offloading"),
Option("mds_bal_min_start", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_bal_min_start", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.2)
.set_description(""),
Option("mds_bal_need_min", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_bal_need_min", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.8)
.set_description(""),
Option("mds_bal_need_max", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_bal_need_max", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1.2)
.set_description(""),
Option("mds_bal_midchunk", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_bal_midchunk", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.3)
.set_description(""),
Option("mds_bal_minchunk", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_bal_minchunk", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.001)
.set_description(""),
Option("mds_bal_target_decay", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(10.0)
.set_description(""),
.set_description("rate of decay for export targets communicated to clients"),
Option("mds_replay_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(1.0)
.set_description(""),
.set_description("time in seconds between replay of updates to journal by standby replay MDS"),
Option("mds_shutdown_check", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_shutdown_check", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_thrash_exports", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_thrash_exports", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_thrash_fragments", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_thrash_fragments", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_dump_cache_on_map", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_dump_cache_on_map", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("mds_dump_cache_after_rejoin", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_dump_cache_after_rejoin", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("mds_verify_scatter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_verify_scatter", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
@ -6334,7 +6319,7 @@ std::vector<Option> get_mds_options() {
.set_default(0)
.set_description(""),
Option("mds_journal_format", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("mds_journal_format", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(1)
.set_description(""),
@ -6346,121 +6331,121 @@ std::vector<Option> get_mds_options() {
.set_default(0)
.set_description(""),
Option("mds_wipe_sessions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_wipe_sessions", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_wipe_ino_prealloc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_wipe_ino_prealloc", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_skip_ino", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_skip_ino", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Option("mds_standby_for_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
.set_description(""),
.set_description("standby for named MDS daemon when not active"),
Option("mds_standby_for_rank", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_standby_for_rank", Option::TYPE_INT, Option::LEVEL_BASIC)
.set_default(-1)
.set_description(""),
.set_description("allow MDS to become a standby:replay daemon"),
Option("mds_standby_for_fscid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(-1)
.set_description(""),
.set_description("standby only for the file system with the given fscid"),
Option("mds_standby_replay", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("mds_standby_replay", Option::TYPE_BOOL, Option::LEVEL_BASIC)
.set_default(false)
.set_description(""),
.set_description("allow MDS to standby replay for an active MDS"),
Option("mds_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("track remote operation progression and statistics"),
Option("mds_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(20)
.set_description(""),
.set_description("maximum size for list of historical operations"),
Option("mds_op_history_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(600)
.set_description(""),
.set_description("expiration time in seconds of historical operations"),
Option("mds_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(30)
.set_description(""),
.set_description("time in seconds to consider an operation blocked after no updates"),
Option("mds_op_log_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("mds_op_log_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(5)
.set_description(""),
Option("mds_snap_min_uid", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
.set_description("minimum uid of client to perform snapshots"),
Option("mds_snap_max_uid", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(4294967294)
.set_description(""),
.set_description("maximum uid of client to perform snapshots"),
Option("mds_snap_rstat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description(""),
.set_description("enabled nested rstat for snapshots"),
Option("mds_verify_backtrace", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("mds_verify_backtrace", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(1)
.set_description(""),
Option("mds_max_completed_flushes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("mds_max_completed_flushes", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(100000)
.set_description(""),
Option("mds_max_completed_requests", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("mds_max_completed_requests", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(100000)
.set_description(""),
Option("mds_action_on_write_error", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1)
.set_description(""),
.set_description("action to take when MDS cannot write to RADOS (0:ignore, 1:read-only, 2:suicide)"),
Option("mds_mon_shutdown_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description("time to wait for mon to receive damaged MDS rank notification"),
Option("mds_max_purge_files", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(64)
.set_description(""),
.set_description("maximum number of deleted files to purge in parallel"),
Option("mds_max_purge_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(8192)
.set_description(""),
.set_description("maximum number of purge operations performed in parallel"),
Option("mds_max_purge_ops_per_pg", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0.5)
.set_description(""),
.set_description("number of parallel purge operations performed per PG"),
Option("mds_purge_queue_busy_flush_period", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("mds_purge_queue_busy_flush_period", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1.0)
.set_description(""),
Option("mds_root_ino_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
.set_description("default uid for new root directory"),
Option("mds_root_ino_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
.set_description("default gid for new root directory"),
Option("mds_max_scrub_ops_in_progress", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description("maximum number of scrub operations performed in parallel"),
Option("mds_damage_table_max_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10000)
.set_description(""),
.set_description("maximum number of damage table entries"),
Option("mds_client_writeable_range_max_inc_objs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1024)
.set_description(""),
.set_description("maximum number of objects in writeable range of a file for a client"),
Option("mds_min_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(100)

View File

@ -231,16 +231,12 @@ void FSMap::create_filesystem(const std::string &name,
{
auto fs = std::make_shared<Filesystem>();
fs->mds_map.fs_name = name;
fs->mds_map.max_mds = 1;
fs->mds_map.data_pools.push_back(data_pool);
fs->mds_map.metadata_pool = metadata_pool;
fs->mds_map.cas_pool = -1;
fs->mds_map.max_file_size = g_conf->mds_max_file_size;
fs->mds_map.compat = compat;
fs->mds_map.created = ceph_clock_now();
fs->mds_map.modified = ceph_clock_now();
fs->mds_map.session_timeout = g_conf->mds_session_timeout;
fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
fs->mds_map.enabled = true;
if (features & CEPH_FEATURE_SERVER_JEWEL) {
fs->fscid = next_filesystem_id++;
@ -275,17 +271,13 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid)
// Carry forward what makes sense
new_fs->fscid = fs->fscid;
new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
new_fs->mds_map.max_mds = 1;
new_fs->mds_map.data_pools = fs->mds_map.data_pools;
new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
new_fs->mds_map.fs_name = fs->mds_map.fs_name;
new_fs->mds_map.max_file_size = g_conf->mds_max_file_size;
new_fs->mds_map.compat = compat;
new_fs->mds_map.created = ceph_clock_now();
new_fs->mds_map.modified = ceph_clock_now();
new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
new_fs->mds_map.enabled = true;

View File

@ -45,19 +45,16 @@ class health_check_map_t;
*/
class Filesystem
{
public:
fs_cluster_id_t fscid;
MDSMap mds_map;
void encode(bufferlist& bl, uint64_t features) const;
void decode(bufferlist::iterator& p);
public:
Filesystem()
:
fscid(FS_CLUSTER_ID_NONE)
{
}
void encode(bufferlist& bl, uint64_t features) const;
void decode(bufferlist::iterator& p);
void dump(Formatter *f) const;
void print(std::ostream& out) const;
@ -77,6 +74,9 @@ class Filesystem
return false;
}
fs_cluster_id_t fscid;
MDSMap mds_map;
};
WRITE_CLASS_ENCODER_FEATURES(Filesystem)

View File

@ -3569,7 +3569,7 @@ void Locker::remove_client_cap(CInode *in, client_t client)
/**
* Return true if any currently revoking caps exceed the
* mds_session_timeout threshold.
* session_timeout threshold.
*/
bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking) const
{
@ -3580,7 +3580,7 @@ bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking) const
} else {
utime_t now = ceph_clock_now();
utime_t age = now - (*p)->get_last_revoke_stamp();
if (age <= g_conf->mds_session_timeout) {
if (age <= mds->mdsmap->get_session_timeout()) {
return false;
} else {
return true;
@ -3644,8 +3644,8 @@ void Locker::caps_tick()
utime_t age = now - cap->get_last_revoke_stamp();
dout(20) << __func__ << " age = " << age << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
if (age <= g_conf->mds_session_timeout) {
dout(20) << __func__ << " age below timeout " << g_conf->mds_session_timeout << dendl;
if (age <= mds->mdsmap->get_session_timeout()) {
dout(20) << __func__ << " age below timeout " << mds->mdsmap->get_session_timeout() << dendl;
break;
} else {
++n;
@ -3656,7 +3656,7 @@ void Locker::caps_tick()
}
}
// exponential backoff of warning intervals
if (age > g_conf->mds_session_timeout * (1 << cap->get_num_revoke_warnings())) {
if (age > mds->mdsmap->get_session_timeout() * (1 << cap->get_num_revoke_warnings())) {
cap->inc_num_revoke_warnings();
stringstream ss;
ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "

View File

@ -1079,7 +1079,7 @@ void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who)
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
// split/merge
if (g_conf->mds_bal_frag && g_conf->mds_bal_fragment_interval > 0 &&
if (g_conf->mds_bal_fragment_interval > 0 &&
!dir->inode->is_base() && // not root/base (for now at least)
dir->is_auth()) {

View File

@ -230,12 +230,12 @@ public:
flags(CEPH_MDSMAP_DEFAULTS), last_failure(0),
last_failure_osd_epoch(0),
tableserver(0), root(0),
session_timeout(0),
session_autoclose(0),
max_file_size(0),
session_timeout(60),
session_autoclose(300),
max_file_size(1ULL<<40), /* 1TB */
cas_pool(-1),
metadata_pool(-1),
max_mds(0),
max_mds(1),
standby_count_wanted(-1),
ever_allowed_features(0),
explicitly_allowed_features(0),
@ -249,10 +249,16 @@ public:
utime_t get_session_timeout() const {
return utime_t(session_timeout,0);
}
void set_session_timeout(uint32_t t) {
session_timeout = t;
}
utime_t get_session_autoclose() const {
return utime_t(session_autoclose, 0);
}
void set_session_autoclose(uint32_t t) {
session_autoclose = t;
}
uint64_t get_max_filesize() const { return max_file_size; }
void set_max_filesize(uint64_t m) { max_file_size = m; }

View File

@ -692,7 +692,7 @@ void Server::find_idle_sessions()
// (caps go stale, lease die)
utime_t now = ceph_clock_now();
utime_t cutoff = now;
cutoff -= g_conf->mds_session_timeout;
cutoff -= mds->mdsmap->get_session_timeout();
while (1) {
Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
if (!session) break;
@ -713,7 +713,7 @@ void Server::find_idle_sessions()
// autoclose
cutoff = now;
cutoff -= g_conf->mds_session_autoclose;
cutoff -= mds->mdsmap->get_session_autoclose();
// don't kick clients if we've been laggy
if (mds->get_laggy_until() > cutoff) {

View File

@ -453,6 +453,36 @@ public:
{
fs->mds_map.set_standby_count_wanted(n);
});
} else if (var == "session_timeout") {
if (interr.length()) {
ss << var << " requires an integer value";
return -EINVAL;
}
if (n < 30) {
ss << var << " must be at least 30s";
return -ERANGE;
}
fsmap.modify_filesystem(
fs->fscid,
[n](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_session_timeout((uint32_t)n);
});
} else if (var == "session_autoclose") {
if (interr.length()) {
ss << var << " requires an integer value";
return -EINVAL;
}
if (n < 30) {
ss << var << " must be at least 30s";
return -ERANGE;
}
fsmap.modify_filesystem(
fs->fscid,
[n](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_session_autoclose((uint32_t)n);
});
} else {
ss << "unknown variable " << var;
return -EINVAL;

View File

@ -388,10 +388,10 @@ COMMAND("fs set " \
"name=fs_name,type=CephString " \
"name=var,type=CephChoices,strings=max_mds|max_file_size"
"|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer" \
"|standby_count_wanted " \
"|standby_count_wanted|session_timeout|session_autoclose " \
"name=val,type=CephString " \
"name=confirm,type=CephString,req=false", \
"set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
"set fs parameter <var> to <val>", "mds", "rw", "cli,rest")
COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple "
"name=val,type=CephString " \
"name=confirm,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \