# ceph/tasks/cephfs/cephfs_test_case.py

import json
import logging
import unittest
import time
import os
import re
from StringIO import StringIO

from tasks.cephfs.fuse_mount import FuseMount
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError

log = logging.getLogger(__name__)


class CephFSTestCase(unittest.TestCase):
    """
    Test case for CephFS.  Requires the caller to populate the fs, mount_a
    and mount_b class attributes with a Filesystem and Mount objects
    (setting mount_b is optional).

    Handles resetting the cluster under test between tests.
    """
    # Environment references
    mounts = None
    fs = None
    ctx = None

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None

    # Declarative test requirements: subclasses should override these to
    # indicate their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_KCLIENT_REMOTE = False
    REQUIRE_ONE_CLIENT_REMOTE = False
    LOAD_SETTINGS = []
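    # e.g. LOAD_SETTINGS = ["mds_session_timeout"] (an assumed setting name)
    # would make the integer value of that MDS config option available as
    # self.mds_session_timeout, loaded via the admin socket in setUp().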

    def setUp(self):
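        """
        Check this class's declarative requirements (MDS/client counts and
        remoteness), skipping the test if they are not met; then reset the
        cluster to a known state: unmount surplus clients, recreate the
        filesystem, wait for any 'full' flag to clear, empty the OSD
        blacklist, reset client auth caps and remount.
        """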
        if len(self.fs.mds_ids) < self.MDSS_REQUIRED:
            raise unittest.SkipTest("Only have {0} MDSs, require {1}".format(
                len(self.fs.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            raise unittest.SkipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_KCLIENT_REMOTE:
            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
                # kclient kill() power cycles nodes, so it requires clients
                # to each be on their own node
                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
                    raise unittest.SkipTest("kclient clients must be on separate nodes")

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.fs.get_mds_hostnames():
                raise unittest.SkipTest("Require first client to be on separate server from MDSs")

        # Unmount all surplus clients
        for i in range(self.CLIENTS_REQUIRED, len(self.mounts)):
            mount = self.mounts[i]
            log.info("Unmounting unneeded client {0}".format(mount.client_id))
            mount.umount_wait()

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.fs.clear_firewall()

        # Unmount in order to start each test on a fresh mount, such
        # that test_barrier can have a firm expectation of what OSD
        # epoch the clients start with.
        if self.mount_a.is_mounted():
            self.mount_a.umount_wait()

        if self.mount_b:
            if self.mount_b.is_mounted():
                self.mount_b.umount_wait()

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing an rm -rf of files
        self.fs.mds_stop()
        if self.fs.exists():
            self.fs.mds_fail()
        self.fs.delete_all()
        self.fs.create()

        # In case the previous filesystem had filled up the RADOS cluster,
        # wait for the 'full' flag to clear.
        osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
        self.wait_until_true(lambda: not self.fs.is_full(),
                             timeout=osd_mon_report_interval_max * 5)

        # In case anything is in the OSD blacklist, clear it out.  This is to
        # avoid the OSD map changing in the background (due to blacklist
        # expiry) while tests run.
        blacklist = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['blacklist']
        log.info("Removing {0} blacklist entries".format(len(blacklist)))
        for addr, blacklisted_at in blacklist.items():
            self.fs.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)

        # In case some test messed with auth caps, reset them
        for mount in self.mounts:
            self.fs.mon_manager.raw_cluster_cmd_result(
                'auth', 'caps', "client.{0}".format(mount.client_id),
                'mds', 'allow',
                'mon', 'allow r',
                'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        if not self.mount_a.is_mounted():
            self.mount_a.mount()
            self.mount_a.wait_until_mounted()

        if self.mount_b:
            if not self.mount_b.is_mounted():
                self.mount_b.mount()
                self.mount_b.wait_until_mounted()

        # Load any config settings of interest
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, int(self.fs.mds_asok(
                ['config', 'get', setting], self.fs.mds_ids[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
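        """
        Undo per-test state: clear any firewall rules, tear down the mounts,
        and remove any ceph.conf settings added via set_conf().
        """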
        self.fs.clear_firewall()
        self.mount_a.teardown()
        if self.mount_b:
            self.mount_b.teardown()

        for subsys, key in self.configs_set:
            self.fs.clear_ceph_conf(subsys, key)

    def set_conf(self, subsys, key, value):
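        """
        Set a ceph.conf value, remembering it so that tearDown() can clear
        it again afterwards.
        """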
        self.configs_set.add((subsys, key))
        self.fs.set_ceph_conf(subsys, key, value)

    def assert_session_count(self, expected, ls_data=None, mds_id=None):
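        """
        Assert that the MDS reports exactly `expected` client sessions,
        optionally reusing a previously fetched 'session ls' result.
        """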
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        self.assertEqual(expected, len(ls_data), "Expected {0} sessions, found {1}".format(
            expected, len(ls_data)
        ))

    def assert_session_state(self, client_id, expected_state):
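        """
        Assert that the MDS session for `client_id` is in `expected_state`.
        """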
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
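        """
        Return the full 'session ls' record for `client_id`.
        """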
        return self.get_session(client_id)

    def _session_list(self):
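        """
        Return the MDS session list, excluding sessions in the 'stale' or
        'closed' states.
        """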
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
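        """
        Return the session record for `client_id`, optionally from a
        previously fetched 'session ls' result.
        """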
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
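        """
        Index a 'session ls' result by client ID.
        """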
        return dict([(s['id'], s) for s in session_ls])

    def wait_until_equal(self, get_fn, expect_val, timeout, reject_fn=None):
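        """
        Poll `get_fn` every few seconds until it returns `expect_val`.
        Raise RuntimeError if `timeout` seconds elapse first, or as soon as
        `reject_fn` (when given) returns true for an observed value.
        """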
        period = 5
        elapsed = 0
        while True:
            val = get_fn()
            if val == expect_val:
                log.debug("wait_until_equal: success")
                return
            elif reject_fn and reject_fn(val):
                raise RuntimeError("wait_until_equal: forbidden value {0} seen".format(val))
            else:
                if elapsed >= timeout:
                    raise RuntimeError("Timed out after {0} seconds waiting for {1} (currently {2})".format(
                        elapsed, expect_val, val
                    ))
                else:
                    log.debug("wait_until_equal: {0} != {1}, waiting...".format(val, expect_val))
                    time.sleep(period)
                    elapsed += period

    def wait_until_true(self, condition, timeout):
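        """
        Poll `condition` every few seconds until it returns true, raising
        RuntimeError after `timeout` seconds.
        """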
        period = 5
        elapsed = 0
        while True:
            if condition():
                log.debug("wait_until_true: success")
                return
            else:
                if elapsed >= timeout:
                    raise RuntimeError("Timed out after {0} seconds".format(elapsed))
                else:
                    log.debug("wait_until_true: waiting...")
                    time.sleep(period)
                    elapsed += period

    def assert_mds_crash(self, daemon_id):
        """
        Assert that a particular MDS daemon crashes (blocking until it does).
        """
        try:
            self.fs.mds_daemons[daemon_id].proc.wait()
        except CommandFailedError as e:
            log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
            self.fs.mds_daemons[daemon_id].proc = None

            # Remove the core dump from the crash, otherwise
            # teuthology.internal.coredump will catch it later and treat it
            # as a failure.
            p = self.fs.mds_daemons[daemon_id].remote.run(args=[
                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
            core_pattern = p.stdout.getvalue().strip()
            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
                # We have seen a core_pattern that looks like it's from teuthology's coredump
                # task, so proceed to clear out the core file
                log.info("Clearing core from pattern: {0}".format(core_pattern))

                # Determine the PID of the crashed MDS by inspecting the MDSMap;
                # it had to talk to the mons to get assigned a rank to reach the
                # point of crashing
                addr = self.fs.mon_manager.get_mds_status(daemon_id)['addr']
                pid_str = addr.split("/")[1]
                log.info("Determined crasher PID was {0}".format(pid_str))

                # Substitute PID into core_pattern to get a glob
                core_glob = core_pattern.replace("%p", pid_str)
                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens

                # Verify that we see the expected single coredump matching the expected pattern
                ls_proc = self.fs.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "ls", run.Raw(core_glob)
                ], stdout=StringIO())
                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
                log.info("Enumerated cores: {0}".format(cores))
                self.assertEqual(len(cores), 1)

                log.info("Found core file {0}, deleting it".format(cores[0]))
                self.fs.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "rm", "-f", cores[0]
                ])
            else:
                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
        else:
            raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))