Merge pull request #285 from ceph/wip-9883

tasks/mds_journal_repair: create new test

Reviewed-by: Greg Farnum <gfarnum@redhat.com>
John Spray 2015-01-19 21:20:11 +00:00
commit 1b2ec97256
3 changed files with 498 additions and 20 deletions

tasks/cephfs/filesystem.py

@@ -151,6 +151,21 @@ class Filesystem(object):
return True
def get_active_names(self):
"""
Return MDS daemon names of those daemons holding ranks
in state up:active
:return: list of strings like ['a', 'b'], sorted by rank
"""
status = self.mon_manager.get_mds_status_all()
result = []
        for mds_status in sorted(status['info'].values(), key=lambda info: info['rank']):
if mds_status['state'] == 'up:active':
result.append(mds_status['name'])
return result
def wait_for_daemons(self, timeout=None):
"""
Wait until all daemons are healthy
@@ -369,8 +384,8 @@ class Filesystem(object):
elif timeout is not None and elapsed > timeout:
raise RuntimeError(
"Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
elapsed, goal_state, current_state
))
else:
time.sleep(1)
elapsed += 1
@@ -420,6 +435,27 @@ class Filesystem(object):
return json.loads(p.stdout.getvalue().strip())
def rados(self, args, pool=None):
"""
Call into the `rados` CLI from an MDS
"""
if pool is None:
pool = self.get_metadata_pool_name()
# Doesn't matter which MDS we use to run rados commands, they all
# have access to the pools
mds_id = self.mds_ids[0]
remote = self.mds_daemons[mds_id].remote
# NB we could alternatively use librados pybindings for this, but it's a one-liner
# using the `rados` CLI
args = ["rados", "-p", pool] + args
p = remote.run(
args=args,
stdout=StringIO())
return p.stdout.getvalue().strip()
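    # Usage sketch: the helpers and tests below call this against the
    # default (metadata) pool and get back stripped stdout, e.g.:
    #   all_objects = self.rados(["ls"]).split("\n")
    #   self.rados(["rm", object_name])
    #   self.rados(["listomapkeys", dirfrag_obj_name])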
def list_dirfrag(self, dir_ino):
"""
Read the named object and return the list of omap keys
@@ -429,32 +465,82 @@ class Filesystem(object):
        """
        dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
        try:
            key_list_str = self.rados(["listomapkeys", dirfrag_obj_name])
        except CommandFailedError as e:
            log.error(e.__str__())
            raise ObjectNotFound(dirfrag_obj_name)
        return key_list_str.split("\n") if key_list_str else []
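    # Worked example of the dirfrag object naming used above: fragment 0 of
    # a directory inode lives in the object "<ino-in-hex>.00000000", whose
    # omap keys are its dentries. CephFS's root inode is 1, so:
    #   "{0:x}.00000000".format(1)             -> "1.00000000"
    #   "{0:x}.00000000".format(0x10000000000) -> "10000000000.00000000"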
    def erase_metadata_objects(self, prefix):
        """
        For all objects in the metadata pool matching the prefix,
        erase them.

        This is O(N) with the number of objects in the pool, so only
        suitable for use on toy test filesystems.
        """
all_objects = self.rados(["ls"]).split("\n")
matching_objects = [o for o in all_objects if o.startswith(prefix)]
for o in matching_objects:
self.rados(["rm", o])
def erase_mds_objects(self, rank):
"""
Erase all the per-MDS objects for a particular rank. This includes
inotable, sessiontable, journal
"""
def obj_prefix(multiplier):
"""
MDS object naming conventions like rank 1's
journal is at 201.***
"""
return "%x." % (multiplier * 0x100 + rank)
# MDS_INO_LOG_OFFSET
self.erase_metadata_objects(obj_prefix(2))
# MDS_INO_LOG_BACKUP_OFFSET
self.erase_metadata_objects(obj_prefix(3))
# MDS_INO_LOG_POINTER_OFFSET
self.erase_metadata_objects(obj_prefix(4))
# MDSTables & SessionMap
self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
    def _run_tool(self, tool, args, rank=None, quiet=False):
        mds_id = self.mds_ids[0]
        remote = self.mds_daemons[mds_id].remote

        # Tests frequently have [client] configuration that jacks up
        # the objecter log level (unlikely to be interesting here)
        # and does not set the mds log level (very interesting here)
if quiet:
base_args = [tool, '--debug-mds=1', '--debug-objecter=1']
else:
base_args = [tool, '--debug-mds=4', '--debug-objecter=1']
if rank is not None:
base_args.extend(["--rank", "%d" % rank])
t1 = datetime.datetime.now()
r = remote.run(
args=base_args + args,
stdout=StringIO()).stdout.getvalue().strip()
duration = datetime.datetime.now() - t1
log.info("Ran {0} in time {1}, result:\n{2}".format(
base_args + args, duration, r
))
return r
def journal_tool(self, args, rank=None, quiet=False):
"""
Invoke cephfs-journal-tool with the passed arguments, and return its stdout
"""
return self._run_tool("cephfs-journal-tool", args, rank, quiet)
def table_tool(self, args, quiet=False):
"""
Invoke cephfs-table-tool with the passed arguments, and return its stdout
"""
return self._run_tool("cephfs-table-tool", args, None, quiet)

tasks/cephfs/mount.py

@@ -108,9 +108,9 @@ class CephFSMount(object):
        p = self._run_python(pyscript)
        p.wait()

    def run_shell(self, args, wait=True):
        args = ["cd", self.mountpoint, run.Raw('&&')] + args
        return self.client_remote.run(args=args, stdout=StringIO(), wait=wait)
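    # NB: with wait=False the caller gets back a still-running process
    # handle; test_reset (below) relies on this to show that an `ls -R`
    # blocks (checking .finished) and to reap it later with .wait(), which
    # raises CommandFailedError on nonzero exit:
    #   blocked_ls = mount.run_shell(["ls", "-R"], wait=False)
    #   ...
    #   blocked_ls.wait()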
def open_no_data(self, basename):
"""
@@ -318,6 +318,36 @@ class CephFSMount(object):
except (CommandFailedError, ConnectionLostError):
pass
def spam_dir_background(self, path):
"""
Create directory `path` and do lots of metadata operations
in it until further notice.
"""
assert(self.is_mounted())
abs_path = os.path.join(self.mountpoint, path)
pyscript = dedent("""
import sys
import time
import os
abs_path = "{abs_path}"
if not os.path.exists(abs_path):
os.makedirs(abs_path)
n = 0
while True:
file_path = os.path.join(abs_path, "tmp%d" % n)
f = open(file_path, 'w')
f.close()
n = n + 1
""").format(abs_path=abs_path)
rproc = self._run_python(pyscript)
self.background_procs.append(rproc)
return rproc
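    # Usage sketch (as in test_reset below): start a spammer per directory,
    # then stop it by closing its stdin and reaping the remote process:
    #   spammer = mount.spam_dir_background("spam_0")
    #   ...
    #   spammer.stdin.close()
    #   try:
    #       spammer.wait()
    #   except CommandFailedError:
    #       pass  # nonzero exit from the interrupted loop is expected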
def get_global_id(self):
raise NotImplementedError()

tasks/mds_journal_repair.py (new file, 362 lines)

@@ -0,0 +1,362 @@
"""
Test our tools for recovering the content of damaged journals
"""
import contextlib
import json
import logging
from textwrap import dedent
import time
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO
from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests
log = logging.getLogger(__name__)
class TestJournalRepair(CephFSTestCase):
def test_inject_to_empty(self):
"""
        That when some dentries are in the journal but nothing is in
        the backing store, we correctly populate the backing store
        from the journalled dentries.
"""
# Inject metadata operations
self.mount_a.run_shell(["touch", "rootfile"])
self.mount_a.run_shell(["mkdir", "subdir"])
self.mount_a.run_shell(["touch", "subdir/subdirfile"])
# There are several different paths for handling hardlinks, depending
# on whether an existing dentry (being overwritten) is also a hardlink
self.mount_a.run_shell(["mkdir", "linkdir"])
# Test inode -> remote transition for a dentry
self.mount_a.run_shell(["touch", "linkdir/link0"])
self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
# Test nothing -> remote transition
self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
# Test remote -> inode transition
self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
self.mount_a.run_shell(["touch", "linkdir/link2"])
# Test remote -> diff remote transition
self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
# Before we unmount, make a note of the inode numbers, later we will
# check that they match what we recover from the journal
rootfile_ino = self.mount_a.path_to_ino("rootfile")
subdir_ino = self.mount_a.path_to_ino("subdir")
linkdir_ino = self.mount_a.path_to_ino("linkdir")
subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
self.mount_a.umount_wait()
# Stop the MDS
self.fs.mds_stop()
self.fs.mds_fail()
# Now, the journal should contain the operations, but the backing
# store shouldn't
with self.assertRaises(ObjectNotFound):
self.fs.list_dirfrag(subdir_ino)
self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
# Execute the dentry recovery, this should populate the backing store
self.fs.journal_tool(['event', 'recover_dentries', 'list'])
# Dentries in ROOT_INO are present
self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head'])
self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
# Now check the MDS can read what we wrote: truncate the journal
# and start the mds.
self.fs.journal_tool(['journal', 'reset'])
self.fs.mds_restart()
self.fs.wait_for_daemons()
# List files
self.mount_a.mount()
self.mount_a.wait_until_mounted()
# First ls -R to populate MDCache, such that hardlinks will
# resolve properly (recover_dentries does not create backtraces,
# so ordinarily hardlinks to inodes that happen not to have backtraces
# will be invisible in readdir).
# FIXME: hook in forward scrub here to regenerate backtraces
        self.mount_a.run_shell(['ls', '-R'])  # first pass warms the MDCache
        proc = self.mount_a.run_shell(['ls', '-R'])  # second pass gives a complete listing
self.assertEqual(proc.stdout.getvalue().strip(),
dedent("""
.:
linkdir
rootfile
subdir
./linkdir:
link0
link1
link2
link3
./subdir:
subdirfile
""").strip())
# Check the correct inos were preserved by path
self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
# Check that the hard link handling came out correctly
self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
# Create a new file, ensure it is not issued the same ino as one of the
# recovered ones
self.mount_a.run_shell(["touch", "afterwards"])
new_ino = self.mount_a.path_to_ino("afterwards")
self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])
def test_reset(self):
"""
That after forcibly modifying the backing store, we can get back into
a good state by resetting the MDSMap.
The scenario is that we have two active MDSs, and we lose the journals. Once
we have completely lost confidence in the integrity of the metadata, we want to
return the system to a single-MDS state to go into a scrub to recover what we
can.
"""
# Set max_mds to 2
self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
# See that we have two active MDSs
self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
reject_fn=lambda v: v > 2 or v < 1)
active_mds_names = self.fs.get_active_names()
# Do a bunch of I/O such that at least some will hit the second MDS: create
# lots of directories so that the balancer should find it easy to make a decision
# to allocate some of them to the second mds.
spammers = []
for n in range(0, 16):
dir_name = "spam_{0}".format(n)
spammers.append(self.mount_a.spam_dir_background(dir_name))
def subtrees_assigned():
got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])
# Greater than 1, because there is typically 1 for ~mds1, and once it
# has been assigned something in addition to that it means it has been
# assigned a "real" subtree.
return rank_1_count > 1
# We are waiting for the MDS to respond to hot directories, which
# is not guaranteed to happen at a particular time, so a lengthy timeout here.
self.wait_until_true(subtrees_assigned, 600)
# Flush the journals so that we have some backing store data
# belonging to one MDS, and some to the other MDS.
for mds_name in active_mds_names:
self.fs.mds_asok(["flush", "journal"], mds_name)
# Stop (hard) the second MDS daemon
self.fs.mds_stop(active_mds_names[1])
# Wipe out the tables for MDS rank 1 so that it is broken and can't start
# (this is the simulated failure that we will demonstrate that the disaster
# recovery tools can get us back from)
self.fs.erase_metadata_objects(prefix="mds1_")
# Try to access files from the client
blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
# Check that this "ls -R" blocked rather than completing: indicates
# it got stuck trying to access subtrees which were on the now-dead MDS.
log.info("Sleeping to check ls is blocked...")
time.sleep(60)
self.assertFalse(blocked_ls.finished)
# This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
# is not coming back. Kill it.
log.info("Killing mount, it's blocked on the MDS we killed")
self.mount_a.kill()
self.mount_a.kill_cleanup()
try:
# Now that the mount is dead, the ls -R should error out.
blocked_ls.wait()
except CommandFailedError:
pass
log.info("Terminating spammer processes...")
for spammer_proc in spammers:
spammer_proc.stdin.close()
try:
spammer_proc.wait()
except CommandFailedError:
pass
# See that the second MDS will crash when it starts and tries to
# acquire rank 1
self.fs.mds_restart(active_mds_names[1])
crasher = self.fs.mds_daemons[active_mds_names[1]].proc
try:
crasher.wait()
except CommandFailedError as e:
log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus))
self.fs.mds_daemons[active_mds_names[1]].proc = None
else:
raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1]))
# Now it's crashed, let the MDSMonitor know that it's not coming back
self.fs.mds_fail(active_mds_names[1])
# Now give up and go through a disaster recovery procedure
self.fs.mds_stop(active_mds_names[0])
self.fs.mds_fail(active_mds_names[0])
# Invoke recover_dentries quietly, because otherwise log spews millions of lines
self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
self.fs.table_tool(["0", "reset", "session"])
self.fs.journal_tool(["journal", "reset"], rank=0)
self.fs.erase_mds_objects(1)
self.fs.mon_remote.run(args=['sudo', 'ceph', 'fs', 'reset', 'default', '--yes-i-really-mean-it'])
# Bring an MDS back online, mount a client, and see that we can walk the full
# filesystem tree again
self.fs.mds_restart(active_mds_names[0])
self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
reject_fn=lambda v: len(v) > 1)
self.mount_a.mount()
self.mount_a.run_shell(["ls", "-R"], wait=True)
def test_table_tool(self):
active_mdss = self.fs.get_active_names()
self.assertEqual(len(active_mdss), 1)
mds_name = active_mdss[0]
self.mount_a.run_shell(["touch", "foo"])
self.fs.mds_asok(["flush", "journal"], mds_name)
log.info(self.fs.table_tool(["all", "show", "inode"]))
log.info(self.fs.table_tool(["all", "show", "snap"]))
log.info(self.fs.table_tool(["all", "show", "session"]))
# Inode table should always be the same because initial state
# and choice of inode are deterministic.
# Should see one inode consumed
self.assertEqual(
json.loads(self.fs.table_tool(["all", "show", "inode"])),
{"0": {
"data": {
"version": 2,
"inotable": {
"projected_free": [
{"start": 1099511628777,
"len": 1099511626775}],
"free": [
{"start": 1099511628777,
"len": 1099511626775}]}},
"result": 0}}
)
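        # Arithmetic behind these expected values: the CephFS client inode
        # range starts at 2**40, and the 1001 inos gone from the head of the
        # range are assumed to be the session's preallocation batch (1000,
        # the default mds_client_prealloc_inos) plus the one inode consumed.
        assert 1099511627776 == 2 ** 40
        assert 1099511628777 == 2 ** 40 + 1001
        assert 1099511626775 == 2 ** 40 - 1001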
# Should see one session
session_data = json.loads(self.fs.table_tool(
["all", "show", "session"]))
self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
self.assertEqual(session_data["0"]["result"], 0)
# Should see no snaps
self.assertEqual(
json.loads(self.fs.table_tool(["all", "show", "snap"])),
{"version": 0,
"snapserver": {"last_snap": 1,
"pending_noop": [],
"snaps": [],
"need_to_purge": {},
"pending_create": [],
"pending_destroy": []},
"result": 0}
)
# Reset everything
for table in ["session", "inode", "snap"]:
self.fs.table_tool(["all", "reset", table])
log.info(self.fs.table_tool(["all", "show", "inode"]))
log.info(self.fs.table_tool(["all", "show", "snap"]))
log.info(self.fs.table_tool(["all", "show", "session"]))
# Should see 0 sessions
session_data = json.loads(self.fs.table_tool(
["all", "show", "session"]))
self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
self.assertEqual(session_data["0"]["result"], 0)
# Should see entire inode range now marked free
self.assertEqual(
json.loads(self.fs.table_tool(["all", "show", "inode"])),
{"0": {"data": {"version": 1,
"inotable": {"projected_free": [
{"start": 1099511627776,
"len": 1099511627776}],
"free": [
{"start": 1099511627776,
"len": 1099511627776}]}},
"result": 0}}
)
# Should see no snaps
self.assertEqual(
json.loads(self.fs.table_tool(["all", "show", "snap"])),
{"version": 1,
"snapserver": {"last_snap": 1,
"pending_noop": [],
"snaps": [],
"need_to_purge": {},
"pending_create": [],
"pending_destroy": []},
"result": 0}
)
@contextlib.contextmanager
def task(ctx, config):
fs = Filesystem(ctx)
# Pick out the clients we will use from the configuration
# =======================================================
if len(ctx.mounts) < 1:
raise RuntimeError("Need at least one clients")
mount_a = ctx.mounts.values()[0]
# Stash references on ctx so that we can easily debug in interactive mode
# =======================================================================
ctx.filesystem = fs
ctx.mount_a = mount_a
run_tests(ctx, config, TestJournalRepair, {
'fs': fs,
'mount_a': mount_a
})
# Continue to any downstream tasks
# ================================
yield