ceph/qa/tasks/cephfs/test_fragment.py

317 lines
10 KiB
Python

from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.orchestra import run
import os
import time
import logging
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class TestFragmentation(CephFSTestCase):
    """
    Tests of directory fragment (dirfrag) splitting and merging in the MDS.

    Each test tunes the `mds_bal_*` settings through `_configure()`, drives
    file creation/removal through `self.mount_a`, and observes the result
    through the MDS perf counters and cache dumps exposed by `self.fs`.
    """
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1

    def get_splits(self):
        """
        Return the `dir_split` value from the `mds` section of the
        `perf dump mds` output obtained via `self.fs.mds_asok()`.
        """
        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split']

    def get_merges(self):
        """
        Return the `dir_merge` value from the `mds` section of the
        `perf dump mds` output obtained via `self.fs.mds_asok()`.
        """
        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge']

    def get_dir_ino(self, path):
        """
        Return the cache entry describing the directory at `path`.

        The entry is looked up in the result of `self.fs.read_cache(path, 0)`
        by matching its 'ino' field against the inode number that
        `self.mount_a` reports for `path` (with surrounding slashes
        stripped).  The test fails if no entry matches.
        """
        dir_cache = self.fs.read_cache(path, 0)
        dir_inono = self.mount_a.path_to_ino(path.strip("/"))
        dir_ino = next(
            (ino for ino in dir_cache if ino['ino'] == dir_inono),
            None
        )
        self.assertIsNotNone(dir_ino)
        return dir_ino

    def _configure(self, **kwargs):
        """
        Apply kwargs as MDS configuration settings, then restart the MDSs
        and wait for the daemons to come back.
        """
        for k, v in kwargs.items():
            self.ceph_cluster.set_ceph_conf("mds", k, str(v))

        self.mds_cluster.mds_fail_restart()
        self.fs.wait_for_daemons()

    def _log_fragmentation(self, depth, splits_expected, files_written):
        """
        Best-effort diagnostic dump of the current fragmentation of
        /splitdir.

        Meant to be called from an exception handler: a failure to query
        the dirfrags is logged and swallowed here so that it cannot mask
        the exception the caller is about to re-raise.
        """
        log.info("depth={0} splits_expected={1} files_written={2}".format(
            depth, splits_expected, files_written
        ))
        try:
            frags = self.get_dir_ino("/splitdir")['dirfrags']
        except Exception:
            log.exception("Unable to read dirfrags of /splitdir")
            return

        log.info("Dirfrags:")
        for f in frags:
            log.info("{0}: {1}".format(
                f['dirfrag'], len(f['dentries'])
            ))

    def test_oversize(self):
        """
        That a directory is split when it becomes too large.
        """

        split_size = 20
        merge_size = 5

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=1
        )

        self.assertEqual(self.get_splits(), 0)

        # One file more than the split size should trigger exactly one split
        self.mount_a.create_n_files("splitdir/file", split_size + 1)

        self.wait_until_true(
            lambda: self.get_splits() == 1,
            timeout=30
        )

        frags = self.get_dir_ino("/splitdir")['dirfrags']
        self.assertEqual(len(frags), 2)
        self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*")
        self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*")
        # No dentry may be lost by the split
        self.assertEqual(
            sum(len(f['dentries']) for f in frags),
            split_size + 1
        )

        self.assertEqual(self.get_merges(), 0)

        # Emptying the directory should merge the fragments back into one
        self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])

        self.wait_until_true(
            lambda: self.get_merges() == 1,
            timeout=30
        )

        self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1)

    def test_rapid_creation(self):
        """
        That the fast-splitting limit of 1.5x normal limit is
        applied when creating dentries quickly.
        """

        split_size = 100
        merge_size = 1

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=3,
            mds_bal_fragment_size_max=int(split_size * 1.5 + 2)
        )

        # We test this only at a single split level.  If a client was sending
        # IO so fast that it hit a second split before the first split
        # was complete, it could violate mds_bal_fragment_size_max -- there
        # is a window where the child dirfrags of a split are unfrozen
        # (so they can grow), but still have STATE_FRAGMENTING (so they
        # can't be split).

        # By writing 4x the split size when the split bits are set
        # to 3 (i.e. 8-ways, 2**3), I am reasonably sure to see precisely
        # one split.  The test is to check whether that split
        # happens soon enough that the client doesn't exceed
        # 2x the split_size (the "immediate" split mode should
        # kick in at 1.5x the split size).

        self.assertEqual(self.get_splits(), 0)
        self.mount_a.create_n_files("splitdir/file", split_size * 4)
        self.wait_until_equal(
            self.get_splits,
            1,
            reject_fn=lambda s: s > 1,
            timeout=30
        )

    def test_deep_split(self):
        """
        That when the directory grows many times larger than split size,
        the fragments get split again.
        """

        split_size = 100
        merge_size = 1  # i.e. don't merge frag unless it's empty
        split_bits = 1

        branch_factor = 2**split_bits

        # Arbitrary: how many levels shall we try fragmenting before
        # ending the test?
        max_depth = 5

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=split_bits
        )

        # Each iteration we will create another level of fragments.  The
        # placement of dentries into fragments is by hashes (i.e. pseudo
        # random), so we rely on statistics to get the behaviour that
        # by writing about 1.5x as many dentries as the split_size times
        # the number of frags, we will get them all to exceed their
        # split size and trigger a split.
        depth = 0
        files_written = 0
        splits_expected = 0
        while depth < max_depth:
            log.info("Writing files for depth {0}".format(depth))
            target_files = branch_factor**depth * int(split_size * 1.5)
            create_files = target_files - files_written

            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
                "{0} Writing {1} files (depth={2})".format(
                    self.__class__.__name__, create_files, depth
                ))
            self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
                                        create_files)
            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
                "{0} Done".format(self.__class__.__name__))

            files_written += create_files
            log.info("Now have {0} files".format(files_written))

            splits_expected += branch_factor**depth
            log.info("Waiting to see {0} splits".format(splits_expected))
            try:
                self.wait_until_equal(
                    self.get_splits,
                    splits_expected,
                    timeout=30,
                    reject_fn=lambda x: x > splits_expected
                )

                frags = self.get_dir_ino("/splitdir")['dirfrags']
                self.assertEqual(len(frags), branch_factor**(depth+1))
                self.assertEqual(
                    sum(len(f['dentries']) for f in frags),
                    target_files
                )
            except Exception:
                # On failures, log what fragmentation we actually ended
                # up with.  This block is just for logging; the helper
                # never raises, so the original exception is what gets
                # re-raised below.
                self._log_fragmentation(depth, splits_expected, files_written)
                raise

            depth += 1

        # Remember the inode number because we will be checking for
        # objects later.
        dir_inode_no = self.mount_a.path_to_ino("splitdir")

        self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
        self.mount_a.umount_wait()

        self.fs.mds_asok(['flush', 'journal'])

        # Wait for all strays to purge
        self.wait_until_equal(
            lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache']
                                     )['mds_cache']['num_strays'],
            0,
            timeout=1200
        )
        # Check that the metadata pool objects for all the myriad
        # child fragments are gone
        metadata_objs = self.fs.rados(["ls"])
        # NOTE(review): the return type of fs.rados() is not visible here.
        # If it hands back the raw `ls` output as a single string, iterating
        # it directly would walk single characters and this check could
        # never fail -- so split text into one object name per line.
        if isinstance(metadata_objs, str):
            metadata_objs = metadata_objs.splitlines()
        frag_prefix = "{0:x}.".format(dir_inode_no)
        frag_objs = [o for o in metadata_objs if o.startswith(frag_prefix)]
        self.assertListEqual(frag_objs, [])

    def test_split_straydir(self):
        """
        That stray dir is split when it becomes too large.
        """
        def _count_fragmented():
            """
            Count the entries of the `~mdsdir` cache dump, other than
            inode 0x100 itself, that have more than one dirfrag.
            """
            mdsdir_cache = self.fs.read_cache("~mdsdir", 1)
            num = 0
            for ino in mdsdir_cache:
                if ino["ino"] == 0x100:
                    continue
                if len(ino["dirfrags"]) > 1:
                    log.info("straydir 0x{:X} is fragmented".format(ino["ino"]))
                    num += 1
            return num

        split_size = 50
        merge_size = 5
        split_bits = 1

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=split_bits,
            mds_bal_fragment_size_max=(split_size * 100)
        )

        # manually split/merge
        self.assertEqual(_count_fragmented(), 0)
        self.fs.mds_asok(["dirfrag", "split", "~mdsdir/stray8", "0/0", "1"])
        self.fs.mds_asok(["dirfrag", "split", "~mdsdir/stray9", "0/0", "1"])
        self.wait_until_true(
            lambda: _count_fragmented() == 2,
            timeout=30
        )

        time.sleep(30)

        self.fs.mds_asok(["dirfrag", "merge", "~mdsdir/stray8", "0/0"])
        self.wait_until_true(
            lambda: _count_fragmented() == 1,
            timeout=30
        )

        time.sleep(30)

        # auto merge

        # merging stray dirs is driven by MDCache::advance_stray()
        # advance stray dir 10 times
        for _ in range(10):
            self.fs.mds_asok(['flush', 'journal'])

        self.wait_until_true(
            lambda: _count_fragmented() == 0,
            timeout=30
        )

        # auto split

        # there are 10 stray dirs. advance stray dir 20 times
        self.mount_a.create_n_files("testdir1/file", split_size * 20)
        self.mount_a.run_shell(["mkdir", "testdir2"])
        testdir1_path = os.path.join(self.mount_a.mountpoint, "testdir1")
        # Hard-link every file into testdir2 so that removing testdir1
        # later leaves the inodes referenced
        for i in self.mount_a.ls(testdir1_path):
            self.mount_a.run_shell(["ln", "testdir1/{0}".format(i), "testdir2/"])

        self.mount_a.umount_wait()
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # flush journal and restart mds. after restart, testdir2 is not in mds' cache
        self.fs.mds_asok(['flush', 'journal'])
        self.mds_cluster.mds_fail_restart()
        self.fs.wait_for_daemons()
        # splitting stray dirs is driven by MDCache::advance_stray()
        # advance stray dir after unlink 'split_size' files.
        self.fs.mds_asok(['config', 'set', 'mds_log_events_per_segment', str(split_size)])

        self.assertEqual(_count_fragmented(), 0)
        self.mount_a.run_shell(["rm", "-rf", "testdir1"])
        self.wait_until_true(
            lambda: _count_fragmented() > 0,
            timeout=30
        )