ceph/qa/tasks/cephfs/test_damage.py

from io import BytesIO, StringIO
import json
import logging
import errno
import re
from teuthology.contextutil import MaxWhileTries
from teuthology.exceptions import CommandFailedError
from teuthology.orchestra.run import wait
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology

DAMAGED_ON_START = "damaged_on_start"
DAMAGED_ON_LS = "damaged_on_ls"
CRASHED = "server crashed"
NO_DAMAGE = "no damage"
READONLY = "readonly"
FAILED_CLIENT = "client failed"
FAILED_SERVER = "server failed"

# An EIO in response to a stat from the client
EIO_ON_LS = "eio"

# An EIO, but nothing in damage table (not ever what we expect)
EIO_NO_DAMAGE = "eio without damage entry"


log = logging.getLogger(__name__)


class TestDamage(CephFSTestCase):
    def _simple_workload_write(self):
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.write_n_mb("subdir/sixmegs", 6)
        return self.mount_a.stat("subdir/sixmegs")

    def is_marked_damaged(self, rank):
        mds_map = self.fs.get_mds_map()
        return rank in mds_map['damaged']

    @for_teuthology #459s
    def test_object_deletion(self):
        """
        That the MDS has a clean 'damaged' response to loss of any single metadata object
        """

        self._simple_workload_write()

        # Hmm, actually it would be nice to permute whether the metadata pool
        # state contains sessions or not, but for the moment close this session
        # to avoid waiting through reconnect on every MDS start.
        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.fail()

        serialized = self.fs.radosmo(['export', '-'])

        def is_ignored(obj_id, dentry=None):
            """
            A filter to avoid redundantly mutating many similar objects (e.g.
            stray dirfrags) or similar dentries (e.g. stray dir dentries)
            """
            if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000":
                return True

            if dentry and obj_id == "100.00000000":
                if re.match("stray.+_head", dentry) and dentry != "stray0_head":
                    return True

            return False

        def get_path(obj_id, dentry=None):
            """
            What filesystem path does this object or dentry correspond to?   i.e.
            what should I poke to see EIO after damaging it?
            """

            if obj_id == "1.00000000" and dentry == "subdir_head":
                return "./subdir"
            elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
                return "./subdir/sixmegs"

            # None means ls will do an "ls -R" in hope of seeing some errors
            return None

        objects = self.fs.radosmo(["ls"], stdout=StringIO()).strip().split("\n")
        objects = [o for o in objects if not is_ignored(o)]

        # Find all objects with an OMAP header
        omap_header_objs = []
        for o in objects:
            header = self.fs.radosmo(["getomapheader", o], stdout=StringIO())
            # The rados CLI wraps the header output in a hex-printed style
            header_bytes = int(re.match("header \((.+) bytes\)", header).group(1))
            if header_bytes > 0:
                omap_header_objs.append(o)

        # Find all OMAP key/vals
        omap_keys = []
        for o in objects:
            keys_str = self.fs.radosmo(["listomapkeys", o], stdout=StringIO())
            if keys_str:
                for key in keys_str.strip().split("\n"):
                    if not is_ignored(o, key):
                        omap_keys.append((o, key))

        # Find objects that have data in their bodies
        data_objects = []
        for obj_id in objects:
            stat_out = self.fs.radosmo(["stat", obj_id], stdout=StringIO())
            size = int(re.match(".+, size (.+)$", stat_out).group(1))
            if size > 0:
                data_objects.append(obj_id)

        # Define the various forms of damage we will inflict
        class MetadataMutation(object):
            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
                self.obj_id = obj_id_
                self.desc = desc_
                self.mutate_fn = mutate_fn_
                self.expectation = expectation_
                if ls_path is None:
                    self.ls_path = "."
                else:
                    self.ls_path = ls_path

            def __eq__(self, other):
                return self.desc == other.desc

            def __hash__(self):
                return hash(self.desc)

        junk = "deadbeef" * 10
        mutations = []

        # Removals
        for o in objects:
            if o in [
                # JournalPointers are auto-replaced if missing (same path as upgrade)
                "400.00000000",
                # Missing dirfrags for non-system dirs result in empty directory
                "10000000000.00000000",
                # PurgeQueue is auto-created if not found on startup
                "500.00000000",
                # open file table is auto-created if not found on startup
                "mds0_openfiles.0"
            ]:
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            log.info("Expectation on rm '{0}' will be '{1}'".format(
                o, expectation
            ))

            mutations.append(MetadataMutation(
                o,
                "Delete {0}".format(o),
                lambda o=o: self.fs.radosm(["rm", o]),
                expectation
            ))

        # Blatant corruptions
        for obj_id in data_objects:
            if obj_id == "500.00000000":
                # purge queue corruption results in read-only FS
                mutations.append(MetadataMutation(
                    obj_id,
                    "Corrupt {0}".format(obj_id),
                    lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)),
                    READONLY
                ))
            else:
                mutations.append(MetadataMutation(
                    obj_id,
                    "Corrupt {0}".format(obj_id),
                    lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)),
                    DAMAGED_ON_START
                ))

        # Truncations
        for o in data_objects:
            if o == "500.00000000":
                # The PurgeQueue is allowed to be empty: Journaler interprets
                # an empty header object as an empty journal.
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            mutations.append(
                MetadataMutation(
                    o,
                    "Truncate {0}".format(o),
                    lambda o=o: self.fs.radosm(["truncate", o, "0"]),
                    expectation
            ))

        # OMAP value corruptions
        for o, k in omap_keys:
            if o.startswith("100."):
                # Anything in rank 0's 'mydir'
                expectation = DAMAGED_ON_START
            else:
                expectation = EIO_ON_LS

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap key {0}:{1}".format(o, k),
                    lambda o=o,k=k: self.fs.radosm(["setomapval", o, k, junk]),
                    expectation,
                    get_path(o, k)
                )
            )

        # OMAP header corruptions
        for o in omap_header_objs:
            if re.match("60.\.00000000", o) \
                    or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
                expectation = DAMAGED_ON_START
            else:
                expectation = NO_DAMAGE

            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
                o, expectation
            ))

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap header on {0}".format(o),
                    lambda o=o: self.fs.radosm(["setomapheader", o, junk]),
                    expectation
                )
            )

        results = {}

        for mutation in mutations:
            log.info("Applying mutation '{0}'".format(mutation.desc))

            # Reset MDS state
            self.mount_a.umount_wait(force=True)
            self.fs.fail()
            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

            # Reset RADOS pool state
            self.fs.radosm(['import', '-'], stdin=BytesIO(serialized))

            # Inject the mutation
            mutation.mutate_fn()

            # Try starting the MDS
            self.fs.set_joinable()

            # How long we'll wait between starting a daemon and expecting
            # it to make it through startup, and potentially declare itself
            # damaged to the mon cluster.
            startup_timeout = 60

            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
                if mutation.expectation == DAMAGED_ON_START:
                    # The MDS may pass through active before making it to damaged
                    try:
                        self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
                    except RuntimeError:
                        pass

                # Wait for MDS to either come up or go into damaged state
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
                except RuntimeError:
                    crashed = False
                    # Didn't make it to healthy or damaged, did it crash?
                    for daemon_id, daemon in self.fs.mds_daemons.items():
                        if daemon.proc and daemon.proc.finished:
                            crashed = True
                            log.error("Daemon {0} crashed!".format(daemon_id))
                            daemon.proc = None  # So that subsequent stop() doesn't raise error
                    if not crashed:
                        # Didn't go health, didn't go damaged, didn't crash, so what?
                        raise
                    else:
                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
                        results[mutation] = CRASHED
                        continue
                if self.is_marked_damaged(0):
                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_START
                    continue
                else:
                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
            else:
                try:
                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
                except RuntimeError:
                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
                    if self.is_marked_damaged(0):
                        results[mutation] = DAMAGED_ON_START
                    else:
                        results[mutation] = FAILED_SERVER
                    continue
                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))

            # MDS is up, should go damaged on ls or client mount
            self.mount_a.mount_wait()
            if mutation.ls_path == ".":
                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
            else:
                proc = self.mount_a.stat(mutation.ls_path, wait=False)

            if mutation.expectation == DAMAGED_ON_LS:
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_LS
                except RuntimeError:
                    if self.fs.are_daemons_healthy():
                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
                            mutation.desc))
                        results[mutation] = NO_DAMAGE
                    else:
                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
                        results[mutation] = FAILED_SERVER
            elif mutation.expectation == READONLY:
                proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
                try:
                    proc.wait()
                except CommandFailedError:
                    stderr = proc.stderr.getvalue()
                    log.info(stderr)
                    if "Read-only file system".lower() in stderr.lower():
                        pass
                    else:
                        raise
            else:
                try:
                    wait([proc], 20)
                    log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
                    results[mutation] = NO_DAMAGE
                except MaxWhileTries:
                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
                    results[mutation] = FAILED_CLIENT
                except CommandFailedError as e:
                    if e.exitstatus == errno.EIO:
                        log.info("Result: EIO on client")
                        results[mutation] = EIO_ON_LS
                    else:
                        log.info("Result: unexpected error {0} on client".format(e))
                        results[mutation] = FAILED_CLIENT

            if mutation.expectation == EIO_ON_LS:
                # EIOs mean something handled by DamageTable: assert that it has
                # been populated
                damage = json.loads(
                    self.fs.mon_manager.raw_cluster_cmd(
                        'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
                if len(damage) == 0:
                    results[mutation] = EIO_NO_DAMAGE

        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
        if failures:
            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
            for mutation, result in failures:
                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
                    mutation.expectation, result, mutation.desc
                ))
            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
        else:
            log.info("All {0} mutations had expected outcomes".format(len(mutations)))

    def test_damaged_dentry(self):
        # Damage to dentrys is interesting because it leaves the
        # directory's `complete` flag in a subtle state where
        # we have marked the dir complete in order that folks
        # can access it, but in actual fact there is a dentry
        # missing
        self.mount_a.run_shell(["mkdir", "subdir/"])

        self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
        self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])

        subdir_ino = self.mount_a.path_to_ino("subdir")

        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.fail()

        # Corrupt a dentry
        junk = "deadbeef" * 10
        dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])

        # Start up and try to list it
        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()
        dentries = self.mount_a.ls("subdir/")

        # The damaged guy should have disappeared
        self.assertEqual(dentries, ["file_undamaged"])

        # I should get ENOENT if I try and read it normally, because
        # the dir is considered complete
        try:
            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.ENOENT)
        else:
            raise AssertionError("Expected ENOENT")

        # The fact that there is damaged should have bee recorded
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        damage_id = damage[0]['id']

        # If I try to create a dentry with the same name as the damaged guy
        # then that should be forbidden
        try:
            self.mount_a.touch("subdir/file_to_be_damaged")
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EIO)
        else:
            raise AssertionError("Expected EIO")

        # Attempting that touch will clear the client's complete flag, now
        # when I stat it I'll get EIO instead of ENOENT
        try:
            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
        except CommandFailedError as e:
            if isinstance(self.mount_a, FuseMount):
                self.assertEqual(e.exitstatus, errno.EIO)
            else:
                # Old kernel client handles this case differently
                self.assertIn(e.exitstatus, [errno.ENOENT, errno.EIO])
        else:
            raise AssertionError("Expected EIO")

        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
        self.assertEqual(nfiles, "2")

        self.mount_a.umount_wait()

        # Now repair the stats
        scrub_json = self.fs.run_scrub(["start", "/subdir", "repair"])
        log.info(json.dumps(scrub_json, indent=2))

        self.assertNotEqual(scrub_json, None)
        self.assertEqual(scrub_json["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=scrub_json["scrub_tag"]), True)

        # Check that the file count is now correct
        self.mount_a.mount_wait()
        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
        self.assertEqual(nfiles, "1")

        # Clean up the omap object
        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])

        # Clean up the damagetable entry
        self.fs.mon_manager.raw_cluster_cmd(
            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
            "damage", "rm", "{did}".format(did=damage_id))

        # Now I should be able to create a file with the same name as the
        # damaged guy if I want.
        self.mount_a.touch("subdir/file_to_be_damaged")

    def test_open_ino_errors(self):
        """
        That errors encountered during opening inos are properly propagated
        """

        self.mount_a.run_shell(["mkdir", "dir1"])
        self.mount_a.run_shell(["touch", "dir1/file1"])
        self.mount_a.run_shell(["mkdir", "dir2"])
        self.mount_a.run_shell(["touch", "dir2/file2"])
        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"])
        self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"])

        file1_ino = self.mount_a.path_to_ino("dir1/file1")
        file2_ino = self.mount_a.path_to_ino("dir2/file2")
        dir2_ino = self.mount_a.path_to_ino("dir2")

        # Ensure everything is written to backing store
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])

        # Drop everything from the MDS cache
        self.fs.fail()
        self.fs.journal_tool(['journal', 'reset'], 0)
        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()

        # Case 1: un-decodeable backtrace

        # Validate that the backtrace is present and decodable
        self.fs.read_backtrace(file1_ino)
        # Go corrupt the backtrace of alpha/target (used for resolving
        # bravo/hardlink).
        self.fs._write_data_xattr(file1_ino, "parent", "rhubarb")

        # Check that touching the hardlink gives EIO
        ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False)
        try:
            ran.wait()
        except CommandFailedError:
            self.assertTrue("Input/output error" in ran.stderr.getvalue())

        # Check that an entry is created in the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        self.assertEqual(damage[0]['damage_type'], "backtrace")
        self.assertEqual(damage[0]['ino'], file1_ino)

        self.fs.mon_manager.raw_cluster_cmd(
            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
            "damage", "rm", str(damage[0]['id']))


        # Case 2: missing dirfrag for the target inode

        self.fs.radosm(["rm", "{0:x}.00000000".format(dir2_ino)])

        # Check that touching the hardlink gives EIO
        ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False)
        try:
            ran.wait()
        except CommandFailedError:
            self.assertTrue("Input/output error" in ran.stderr.getvalue())

        # Check that an entry is created in the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 2)
        if damage[0]['damage_type'] == "backtrace" :
            self.assertEqual(damage[0]['ino'], file2_ino)
            self.assertEqual(damage[1]['damage_type'], "dir_frag")
            self.assertEqual(damage[1]['ino'], dir2_ino)
        else:
            self.assertEqual(damage[0]['damage_type'], "dir_frag")
            self.assertEqual(damage[0]['ino'], dir2_ino)
            self.assertEqual(damage[1]['damage_type'], "backtrace")
            self.assertEqual(damage[1]['ino'], file2_ino)

        for entry in damage:
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "rm", str(entry['id']))