diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 47f5b55e3c6..5ecdae2050d 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -20,7 +20,9 @@ string will no longer be able to set quotas or any layout fields. This flag previously only restricted modification of the pool and namespace fields in layouts. - +* CephFS directory fragmentation (large directory support) is enabled + by default on new filesystems. To enable it on existing filesystems + use "ceph fs set <fs_name> allow_dirfrags true". 12.0.0 ------ diff --git a/doc/cephfs/dirfrags.rst b/doc/cephfs/dirfrags.rst index e6bd045b225..717553fea9a 100644 --- a/doc/cephfs/dirfrags.rst +++ b/doc/cephfs/dirfrags.rst @@ -14,7 +14,7 @@ here should be left at their default values. While directory fragmentation enables CephFS to handle very large numbers of entries in a single directory, application programmers should -remain cautious about creating very large directories, as they still +remain conservative about creating very large directories, as they still have a resource cost in situations such as a CephFS client listing the directory, where all the fragments must be loaded at once. @@ -27,7 +27,8 @@ Splitting and merging An MDS will only consider doing splits and merges if the ``mds_bal_frag`` setting is true in the MDS's configuration file, and the allow_dirfrags -setting is true in the filesystem map (set on the mons). +setting is true in the filesystem map (set on the mons). These settings +are both true by default since the *Luminous* (12.2.x) release of Ceph. When an MDS identifies a directory fragment to be split, it does not do the split immediately. Because splitting interrupts metadata IO, diff --git a/doc/cephfs/experimental-features.rst b/doc/cephfs/experimental-features.rst index 1f6e3c2af41..bdfa998a99e 100644 --- a/doc/cephfs/experimental-features.rst +++ b/doc/cephfs/experimental-features.rst @@ -12,18 +12,6 @@ what is required to enable them. 
Note that doing so will *irrevocably* flag maps in the monitor as having once enabled this flag to improve debugging and support processes. - -Directory Fragmentation ------------------------ -CephFS directories are generally stored within a single RADOS object. But this has -certain negative results once they become large enough. The filesystem is capable -of "fragmenting" these directories into multiple objects. There are no known bugs -with doing so but it is not sufficiently tested to support at this time. - -Directory fragmentation has always been off by default and required setting -```mds bal frag = true`` in the MDS' config file. It has been further protected -by requiring the user to set the "allow_dirfrags" flag for Jewel. - Inline data ----------- By default, all CephFS file data is stored in RADOS objects. The inline data @@ -97,3 +85,20 @@ and may not work together; see above. Multiple filesystems were available starting in the Jewel release candidates but were protected behind the "enable_multiple" flag before the final release. + + +Previously experimental features +================================ + +Directory Fragmentation +----------------------- + +Directory fragmentation was considered experimental prior to the *Luminous* +(12.2.x) release. It is now enabled by default on new filesystems. 
To enable directory +fragmentation on filesystems created with older versions of Ceph, set +the ``allow_dirfrags`` flag on the filesystem: + +:: + + ceph fs set <fs_name> allow_dirfrags true + diff --git a/qa/cephfs/tasks/cfuse_workunit_suites_blogbench.yaml b/qa/cephfs/tasks/cfuse_workunit_suites_blogbench.yaml index 4c1fcc11ed9..2d370d7ef9f 100644 --- a/qa/cephfs/tasks/cfuse_workunit_suites_blogbench.yaml +++ b/qa/cephfs/tasks/cfuse_workunit_suites_blogbench.yaml @@ -1,4 +1,8 @@ tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: clients: all: diff --git a/qa/cephfs/tasks/cfuse_workunit_suites_ffsb.yaml b/qa/cephfs/tasks/cfuse_workunit_suites_ffsb.yaml index 4a2a627fe5d..9b1578900d2 100644 --- a/qa/cephfs/tasks/cfuse_workunit_suites_ffsb.yaml +++ b/qa/cephfs/tasks/cfuse_workunit_suites_ffsb.yaml @@ -4,6 +4,10 @@ overrides: osd: filestore flush min: 0 tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: clients: all: diff --git a/qa/cephfs/tasks/libcephfs_interface_tests.yaml b/qa/cephfs/tasks/libcephfs_interface_tests.yaml index fb3a05f4854..c59775259fc 100644 --- a/qa/cephfs/tasks/libcephfs_interface_tests.yaml +++ b/qa/cephfs/tasks/libcephfs_interface_tests.yaml @@ -4,6 +4,10 @@ overrides: kclient: disabled: true tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: clients: client.0: diff --git a/qa/suites/fs/basic/tasks/cfuse_workunit_kernel_untar_build.yaml b/qa/suites/fs/basic/tasks/cfuse_workunit_kernel_untar_build.yaml index 8dbc24a9feb..1e71bb401ae 100644 --- a/qa/suites/fs/basic/tasks/cfuse_workunit_kernel_untar_build.yaml +++ b/qa/suites/fs/basic/tasks/cfuse_workunit_kernel_untar_build.yaml @@ -4,6 +4,10 @@ overrides: client: fuse_default_permissions: 0 tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: clients: all: diff --git a/qa/suites/fs/basic/tasks/cfuse_workunit_misc.yaml b/qa/suites/fs/basic/tasks/cfuse_workunit_misc.yaml index 5d54f3da0b4..fac769ed5d9 100644 --- 
a/qa/suites/fs/basic/tasks/cfuse_workunit_misc.yaml +++ b/qa/suites/fs/basic/tasks/cfuse_workunit_misc.yaml @@ -1,4 +1,8 @@ tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: timeout: 6h clients: diff --git a/qa/suites/fs/basic/tasks/cfuse_workunit_norstats.yaml b/qa/suites/fs/basic/tasks/cfuse_workunit_norstats.yaml index 4833371df1b..bfed71c1b51 100644 --- a/qa/suites/fs/basic/tasks/cfuse_workunit_norstats.yaml +++ b/qa/suites/fs/basic/tasks/cfuse_workunit_norstats.yaml @@ -1,4 +1,8 @@ tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: timeout: 6h clients: diff --git a/qa/suites/fs/basic/tasks/cfuse_workunit_suites_fsx.yaml b/qa/suites/fs/basic/tasks/cfuse_workunit_suites_fsx.yaml index 8b2b1ab5c14..b16cfb17d60 100644 --- a/qa/suites/fs/basic/tasks/cfuse_workunit_suites_fsx.yaml +++ b/qa/suites/fs/basic/tasks/cfuse_workunit_suites_fsx.yaml @@ -1,4 +1,8 @@ tasks: +- check-counter: + counters: + mds: + - "mds.dir_split" - workunit: clients: all: diff --git a/qa/tasks/check_counter.py b/qa/tasks/check_counter.py new file mode 100644 index 00000000000..a3d84e00fe1 --- /dev/null +++ b/qa/tasks/check_counter.py @@ -0,0 +1,96 @@ + +import logging +import json + +from teuthology.task import Task +from teuthology import misc +import ceph_manager + +log = logging.getLogger(__name__) + + +class CheckCounter(Task): + """ + Use this task to validate that some daemon perf counters were + incremented by the nested tasks. + + Config: + 'cluster_name': optional, specify which cluster + 'counters': dictionary of daemon type to list of performance counters. + 'dry_run': just log the value of the counters, don't fail if they + aren't nonzero. + + Success condition is that for all of the named counters, at least + one of the daemons of that type has the counter nonzero. 
+ + Example to check cephfs dirfrag splits are happening: + - install: + - ceph: + - ceph-fuse: + - check-counter: + counters: + mds: + - "mds.dir_split" + - workunit: ... + """ + + def start(self): + log.info("START") + + def end(self): + cluster_name = self.config.get('cluster_name', None) + dry_run = self.config.get('dry_run', False) + targets = self.config.get('counters', {}) + + if cluster_name is None: + cluster_name = self.ctx.managers.keys()[0] + + for daemon_type, counters in targets.items(): + # List of 'a', 'b', 'c'... + daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type)) + daemons = dict([(daemon_id, + self.ctx.daemons.get_daemon(daemon_type, daemon_id)) + for daemon_id in daemon_ids]) + + seen = set() + + for daemon_id, daemon in daemons.items(): + if not daemon.running(): + log.info("Ignoring daemon {0}, it isn't running".format(daemon_id)) + continue + else: + log.debug("Getting stats from {0}".format(daemon_id)) + + manager = self.ctx.managers[cluster_name] + proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"]) + response_data = proc.stdout.getvalue().strip() + if response_data: + perf_dump = json.loads(response_data) + else: + log.warning("No admin socket response from {0}, skipping".format(daemon_id)) + continue + + for counter in counters: + subsys, counter_id = counter.split(".") + if subsys not in perf_dump or counter_id not in perf_dump[subsys]: + log.warning("Counter '{0}' not found on daemon {1}.{2}".format( + counter, daemon_type, daemon_id)) + continue + value = perf_dump[subsys][counter_id] + + log.info("Daemon {0}.{1} {2}={3}".format( + daemon_type, daemon_id, counter, value + )) + + if value > 0: + seen.add(counter) + + if not dry_run: + unseen = set(counters) - set(seen) + if unseen: + raise RuntimeError("The following counters failed to be set " + "on {0} daemons: {1}".format( + daemon_type, unseen + )) + +task = CheckCounter diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 
9eb84cc1b9f..c9a9beefff6 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -527,7 +527,7 @@ OPTION(mds_log_max_expiring, OPT_INT, 20) OPTION(mds_bal_sample_interval, OPT_DOUBLE, 3.0) // every 3 seconds OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000) OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0) -OPTION(mds_bal_frag, OPT_BOOL, false) +OPTION(mds_bal_frag, OPT_BOOL, true) OPTION(mds_bal_split_size, OPT_INT, 10000) OPTION(mds_bal_split_rd, OPT_FLOAT, 25000) OPTION(mds_bal_split_wr, OPT_FLOAT, 10000) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 2d1486d94c7..479999d06c0 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -239,6 +239,8 @@ struct ceph_mon_subscribe_ack { #define CEPH_MDSMAP_ALLOW_CLASSICS (CEPH_MDSMAP_ALLOW_SNAPS | CEPH_MDSMAP_ALLOW_MULTIMDS | \ CEPH_MDSMAP_ALLOW_DIRFRAGS) +#define CEPH_MDSMAP_DEFAULTS CEPH_MDSMAP_ALLOW_DIRFRAGS + /* * mds states * > 0 -> in diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index e1874459bfb..1d8e29b2a36 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -220,7 +220,7 @@ public: public: MDSMap() : epoch(0), enabled(false), fs_name(MDS_FS_NAME_DEFAULT), - flags(0), last_failure(0), + flags(CEPH_MDSMAP_DEFAULTS), last_failure(0), last_failure_osd_epoch(0), tableserver(0), root(0), session_timeout(0), diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 6093b5dd4ff..21918032e55 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -379,12 +379,6 @@ public: }); ss << "disallowed new directory fragmentation"; } else { - string confirm; - if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) || - confirm != "--yes-i-really-mean-it") { - ss << EXPERIMENTAL_WARNING; - return -EPERM; - } fsmap.modify_filesystem( fs->fscid, [](std::shared_ptr fs)