"""
Test CephFS scrub (distinct from OSD scrub) functionality
"""
from io import BytesIO
import logging
from collections import namedtuple
from tasks.cephfs.cephfs_test_case import CephFSTestCase
log = logging.getLogger(__name__)
ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
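# A Workload's validate() is expected to append one ValidationError per failed
# check and return self._errors. Illustrative only (mirrors the pattern used by
# the sibling recovery suites; requires `import traceback`):
#     self._errors.append(ValidationError(e, traceback.format_exc(3)))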
class Workload(CephFSTestCase):
    def __init__(self, test, filesystem, mount):
        super().__init__()
        self._test = test
        self._mount = mount
        self._filesystem = filesystem
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them. Backtraces
        # are rather verbose, but we only see them when something breaks, and they
        # let us see which check failed without having to decorate each check with
        # a string
        self._errors = []
    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present (i.e. have
        survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to recover from. By
        default just wipe everything in the metadata pool
        """
        # Delete every object in the metadata pool
        pool = self._filesystem.get_metadata_pool_name()
        self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it'])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._filesystem.mds_asok(["flush", "journal"])

class BacktraceWorkload(Workload):
    """
    Single file, single directory, wipe the backtrace and check it.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)

    def validate(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
        bt = self._filesystem.read_backtrace(st['st_ino'])
        parent = bt['ancestors'][0]['dname']
        self.assertEqual(parent, 'sixmegs')
        return self._errors
    def damage(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")

    def create_files(self, nfiles=1000):
        self._mount.create_n_files("scrub-new-files/file", nfiles)

class DupInodeWorkload(Workload):
    """
    Duplicate an inode and try scrubbing it twice.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "parent"])
        self._mount.run_shell(["mkdir", "parent/child"])
        self._mount.write_n_mb("parent/parentfile", 6)
        self._mount.write_n_mb("parent/child/childfile", 6)

    def damage(self):
        self._mount.umount_wait()
        self._filesystem.mds_asok(["flush", "journal"])
        self._filesystem.fail()
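        # 10000000000.00000000 is the dirfrag object of the first user-created
        # directory ("parent" here, assuming it was allocated the first free
        # inode number). Copying parentfile's dentry omap value under a second
        # key leaves two dentries referencing the same inode: a duplicate inode.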
        d = self._filesystem.radosmo(["getomapval", "10000000000.00000000", "parentfile_head", "-"])
        self._filesystem.radosm(["setomapval", "10000000000.00000000", "shadow_head"], stdin=BytesIO(d))
        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
        self._filesystem.set_joinable()
        self._filesystem.wait_for_daemons()

    def validate(self):
        out_json = self._filesystem.run_scrub(["start", "/", "recursive", "repair"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self._filesystem.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
        self.assertTrue(self._filesystem.are_daemons_healthy())
        return self._errors

class TestScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def setUp(self):
        super().setUp()
    def _scrub(self, workload, workers=1):
        """
        That after the workload damages the filesystem, a recursive repair
        scrub leaves it healthy and the workload's files present and correct.
        """
        # First, inject some files
        workload.write()

        # (mds verify scatter / mds debug scatterstat are off by default, but
        # QA enables them, so we need to explicitly disable them here)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
        # Apply any data damage the workload wants
        workload.damage()

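        # Scrub is asynchronous: "scrub start" returns a JSON blob containing a
        # scrub_tag, which we then use to poll until this particular scrub has
        # finished.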
        out_json = self.fs.run_scrub(["start", "/", "recursive", "repair"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))
    def _get_damage_count(self, damage_type='backtrace'):
        out_json = self.fs.rank_tell(["damage", "ls"])
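        # "damage ls" returns the rank's damage table as a list of entries;
        # the exact shape below is illustrative:
        #     [{"damage_type": "backtrace", "id": ..., "ino": ...}, ...]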
        self.assertNotEqual(out_json, None)

        damage_count = 0
        for it in out_json:
            if it['damage_type'] == damage_type:
                damage_count += 1
        return damage_count
    def _scrub_new_files(self, workload):
        """
        That scrubbing new files does not lead to errors
        """
        workload.create_files(1000)
        self.fs.wait_until_scrub_complete()
        self.assertEqual(self._get_damage_count(), 0)

    def test_scrub_backtrace_for_new_files(self):
        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_backtrace(self):
        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_dup_inode(self):
        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))