Merge pull request #296 from ceph/wip-standby-mds

Wip standby mds
commit 2999a4d492
Zack Cerza, 2014-07-25 16:44:34 -06:00
5 changed files with 235 additions and 123 deletions


@@ -5,12 +5,16 @@ import logging
 import time
 
 from teuthology import misc
+from teuthology.parallel import parallel
 from teuthology.task import ceph_manager
 
 log = logging.getLogger(__name__)
 
+DAEMON_WAIT_TIMEOUT = 120
+
 
 class Filesystem(object):
     """
     This object is for driving a CephFS filesystem.
@@ -23,51 +27,112 @@ class Filesystem(object):
         self._ctx = ctx
         self._config = config
 
-        mds_list = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
-        if len(mds_list) != 1:
-            # Require exactly one MDS, the code path for creation failure when
-            # a standby is available is different
-            raise RuntimeError("This task requires exactly one MDS")
-        self.mds_id = mds_list[0]
-
-        (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=self.mds_id)).remotes.iterkeys()
-        manager = ceph_manager.CephManager(
-            mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
-        )
-        self.mds_manager = manager
+        self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+        if len(self.mds_ids) == 0:
+            raise RuntimeError("This task requires at least one MDS")
+
+        first_mon = misc.get_first_mon(ctx, config)
+        (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+        self.mon_manager = ceph_manager.CephManager(mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))
+        self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids])
 
         client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
         self.client_id = client_list[0]
         self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
 
-    def mds_stop(self):
+    def are_daemons_healthy(self):
         """
-        Stop the MDS daemon process. If it held a rank, that rank
+        Return true if all daemons are in one of active, standby, standby-replay
+        :return:
+        """
+        status = self.mon_manager.get_mds_status_all()
+        for mds_id, mds_status in status['info'].items():
+            if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]:
+                log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state']))
+                return False
+        return True
+
+    def wait_for_daemons(self, timeout=None):
+        """
+        Wait until all daemons are healthy
+        :return:
+        """
+        if timeout is None:
+            timeout = DAEMON_WAIT_TIMEOUT
+
+        elapsed = 0
+        while True:
+            if self.are_daemons_healthy():
+                return
+            else:
+                time.sleep(1)
+                elapsed += 1
+
+            if elapsed > timeout:
+                raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
+
+    def get_lone_mds_id(self):
+        if len(self.mds_ids) != 1:
+            raise ValueError("Explicit MDS argument required when multiple MDSs in use")
+        else:
+            return self.mds_ids[0]
+
+    def _one_or_all(self, mds_id, cb):
+        """
+        Call a callback for a single named MDS, or for all.
+
+        :param mds_id: MDS daemon name, or None
+        :param cb: Callback taking single argument of MDS daemon name
+        """
+        if mds_id is None:
+            with parallel() as p:
+                for mds_id in self.mds_ids:
+                    p.spawn(cb, mds_id)
+        else:
+            cb(mds_id)
+
+    def mds_stop(self, mds_id=None):
+        """
+        Stop the MDS daemon process(es). If it held a rank, that rank
         will eventually go laggy.
         """
-        mds = self._ctx.daemons.get_daemon('mds', self.mds_id)
-        mds.stop()
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop())
 
-    def mds_fail(self):
+    def mds_fail(self, mds_id=None):
         """
-        Inform MDSMonitor that the daemon process is dead. If it held
+        Inform MDSMonitor of the death of the daemon process(es). If it held
         a rank, that rank will be relinquished.
         """
-        self.mds_manager.raw_cluster_cmd("mds", "fail", "0")
+        self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_))
 
-    def mds_restart(self):
-        mds = self._ctx.daemons.get_daemon('mds', self.mds_id)
-        mds.restart()
+    def mds_restart(self, mds_id=None):
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart())
+
+    def mds_fail_restart(self, mds_id=None):
+        """
+        Variation on restart that includes marking MDSs as failed, so that doing this
+        operation followed by waiting for healthy daemon states guarantees that they
+        have gone down and come up, rather than potentially seeing the healthy states
+        that existed before the restart.
+        """
+        def _fail_restart(id_):
+            self.mds_daemons[id_].stop()
+            self.mon_manager.raw_cluster_cmd("mds", "fail", id_)
+            self.mds_daemons[id_].restart()
+
+        self._one_or_all(mds_id, _fail_restart)
 
     def reset(self):
         log.info("Creating new filesystem")
-        assert not self._ctx.daemons.get_daemon('mds', self.mds_id).running()
-        self.mds_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "0")
-        self.mds_manager.raw_cluster_cmd_result('mds', 'fail', self.mds_id)
-        self.mds_manager.raw_cluster_cmd_result('fs', 'rm', "default", "--yes-i-really-mean-it")
-        self.mds_manager.raw_cluster_cmd_result('fs', 'new', "default", "metadata", "data")
+        self.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "0")
+        for mds_id in self.mds_ids:
+            assert not self._ctx.daemons.get_daemon('mds', mds_id).running()
+            self.mon_manager.raw_cluster_cmd_result('mds', 'fail', mds_id)
+        self.mon_manager.raw_cluster_cmd_result('fs', 'rm', "default", "--yes-i-really-mean-it")
+        self.mon_manager.raw_cluster_cmd_result('fs', 'new', "default", "metadata", "data")
 
@@ -110,8 +175,10 @@ class Filesystem(object):
         return version
 
-    def mds_asok(self, command):
-        proc = self.mds_manager.admin_socket('mds', self.mds_id, command)
+    def mds_asok(self, command, mds_id=None):
+        if mds_id is None:
+            mds_id = self.get_lone_mds_id()
+        proc = self.mon_manager.admin_socket('mds', mds_id, command)
         response_data = proc.stdout.getvalue()
         log.info("mds_asok output: {0}".format(response_data))
         if response_data.strip():
@@ -119,7 +186,7 @@ class Filesystem(object):
         else:
             return None
 
-    def wait_for_state(self, goal_state, reject=None, timeout=None):
+    def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None):
         """
         Block until the MDS reaches a particular state, or a failure condition
         is met.
@@ -130,10 +197,13 @@ class Filesystem(object):
        :return: number of seconds waited, rounded down to integer
        """
+        if mds_id is None:
+            mds_id = self.get_lone_mds_id()
+
         elapsed = 0
         while True:
             # mds_info is None if no daemon currently claims this rank
-            mds_info = self.mds_manager.get_mds_status(self.mds_id)
+            mds_info = self.mon_manager.get_mds_status(mds_id)
             current_state = mds_info['state'] if mds_info else None

             if current_state == goal_state:
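
The net effect of this file is that Filesystem now tracks a list of MDS daemons and drives the cluster through a mon manager instead of a single hard-wired MDS. A minimal sketch of how a downstream test might use the new interface; the ctx/config objects come from the surrounding teuthology run, and 'session ls' is only an illustrative admin-socket command, not something this PR adds:

    from teuthology.task.cephfs.filesystem import Filesystem

    def exercise_failover(ctx, config):
        fs = Filesystem(ctx, config)

        # With no mds_id argument, _one_or_all fans out to every MDS in
        # parallel: stop them all, mark them failed, restart them.
        fs.mds_fail_restart()

        # Poll until every daemon reports up:active, up:standby or
        # up:standby-replay; raises RuntimeError after
        # DAEMON_WAIT_TIMEOUT (120s) by default.
        fs.wait_for_daemons()

        # Admin-socket calls need an explicit daemon name once several
        # MDSs are configured; get_lone_mds_id() raises ValueError then.
        for mds_id in fs.mds_ids:
            fs.mds_asok(['session', 'ls'], mds_id=mds_id)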


@@ -80,3 +80,19 @@ class KernelMount(CephFSMount):
                 mnt,
             ],
         )
+
+    def cleanup(self):
+        pass
+
+    def umount_wait(self):
+        pass
+
+    def is_mounted(self):
+        return True
+
+    def wait_until_mounted(self):
+        pass
+
+    def teardown(self):
+        super(KernelMount, self).teardown()
+        self.umount()
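
Note that these kernel-client methods are deliberate stubs: is_mounted() always answers True, and umount_wait()/wait_until_mounted() do nothing, so code written against the shared CephFSMount interface runs unchanged on kclient even though only ceph-fuse gets real unmount semantics for now. A sketch of a helper that stays client-agnostic because it assumes only that interface (run_client_workload is a hypothetical name):

    def run_client_workload(mount):
        # 'mount' may be a FuseMount or a KernelMount; only base-class
        # methods are used here.
        if not mount.is_mounted():      # always True for KernelMount
            mount.mount()
            mount.wait_until_mounted()  # no-op stub on the kernel client
        mount.create_files()
        mount.check_files()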


@@ -1,4 +1,4 @@
+from contextlib import contextmanager
 import logging
 import datetime
 from textwrap import dedent
@@ -47,6 +47,22 @@ class CephFSMount(object):
     def cleanup(self):
         raise NotImplementedError()
 
+    def wait_until_mounted(self):
+        raise NotImplementedError()
+
+    @contextmanager
+    def mounted(self):
+        """
+        A context manager, from an initially unmounted state, to mount
+        this, yield, and then unmount and clean up.
+        """
+        self.mount()
+        self.wait_until_mounted()
+
+        try:
+            yield
+        finally:
+            self.umount_wait()
 
     def create_files(self):
         assert(self.is_mounted())
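
The new mounted() context manager is what lets the journal migration task below bracket a client workload without nesting an entire ceph_fuse task. A minimal usage sketch, assuming 'mount' is any CephFSMount subclass instance (e.g. pulled from ctx.mounts):

    with mount.mounted():
        # mount() and wait_until_mounted() have already run here;
        # umount_wait() executes on exit even if the workload raises.
        mount.create_files()
        mount.check_files()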


@@ -4,7 +4,6 @@ import logging
 
 from teuthology import misc
 from teuthology.task.ceph import write_conf
-from teuthology.task.ceph_fuse import task as ceph_fuse_ctx
 from teuthology.task.cephfs.filesystem import Filesystem
 
 log = logging.getLogger(__name__)
@@ -28,6 +27,13 @@ def task(ctx, config):
             client: client.0
     """
+    if not hasattr(ctx, 'ceph'):
+        raise RuntimeError("This task must be nested in 'ceph' task")
+
+    if not hasattr(ctx, 'mounts'):
+        raise RuntimeError("This task must be nested inside 'kclient' or 'ceph_fuse' task")
+
+    # Determine which client we will use
     if config and 'client' in config:
         # Use client specified in config
         client_role = config['client']
@@ -43,17 +49,13 @@ def task(ctx, config):
             client_id = client_list[0]
         except IndexError:
             raise RuntimeError("This task requires at least one client")
-        else:
-            client_role = "client.{0}".format(client_id)
 
     fs = Filesystem(ctx, config)
+    ctx.fs = fs
     old_journal_version = JOURNAL_FORMAT_LEGACY
     new_journal_version = JOURNAL_FORMAT_RESILIENT
 
     # Set config so that journal will be created in older format
-    if not hasattr(ctx, 'ceph'):
-        raise RuntimeError("This task must be nested in 'ceph' task")
     if 'mds' not in ctx.ceph.conf:
         ctx.ceph.conf['mds'] = {}
     ctx.ceph.conf['mds']['mds journal format'] = old_journal_version
@@ -61,13 +63,15 @@ def task(ctx, config):
     # used a different config path this won't work.
 
     # Create a filesystem using the older journal format.
+    for mount in ctx.mounts.values():
+        mount.umount_wait()
     fs.mds_stop()
     fs.reset()
     fs.mds_restart()
 
     # Do some client work so that the log is populated with something.
-    with ceph_fuse_ctx(ctx, [client_role]) as client_mounts:
-        mount = client_mounts[client_id]
+    mount = ctx.mounts[client_id]
+    with mount.mounted():
         mount.create_files()
         mount.check_files()  # sanity, this should always pass
@@ -76,12 +80,15 @@ def task(ctx, config):
     write_conf(ctx)
 
     # Restart the MDS.
-    fs.mds_restart()
+    fs.mds_fail_restart()
+
+    # This ensures that all daemons come up into a valid state
+    fs.wait_for_daemons()
 
     # Check that files created in the initial client workload are still visible
     # in a client mount.
-    with ceph_fuse_ctx(ctx, [client_role]) as client_mounts:
-        mount = client_mounts[client_id]
+    with mount.mounted():
         mount.check_files()
 
     # Verify that the journal really has been rewritten.
@@ -91,4 +98,8 @@ def task(ctx, config):
             new_journal_version, journal_version()
         ))
 
+    # Leave all MDSs and clients running for any child tasks
+    for mount in ctx.mounts.values():
+        mount.mount()
+
     yield
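
Because the task now stores the Filesystem object on the context (ctx.fs) and re-mounts every client before yielding, tasks nested under it inherit a live filesystem. A sketch of a hypothetical child task relying on that; the name child_task is illustrative only:

    def child_task(ctx, config):
        fs = ctx.fs                      # set by the task above
        mount = ctx.mounts.values()[0]   # re-mounted before the yield
        assert fs.are_daemons_healthy()
        assert mount.is_mounted()
        yield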


@@ -5,12 +5,15 @@ import logging
 import pipes
 import os
 
-from teuthology import misc as teuthology
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError
 from teuthology.parallel import parallel
 from ..orchestra import run
 
 log = logging.getLogger(__name__)
 
+CLIENT_PREFIX = 'client.'
+
 
 def task(ctx, config):
     """
@@ -63,7 +66,7 @@ def task(ctx, config):
         'configuration must contain a dictionary of clients'
 
     overrides = ctx.config.get('overrides', {})
-    teuthology.deep_merge(config, overrides.get('workunit', {}))
+    misc.deep_merge(config, overrides.get('workunit', {}))
 
     refspec = config.get('branch')
     if refspec is None:
@@ -77,46 +80,42 @@ def task(ctx, config):
     log.info('Pulling workunits from ref %s', refspec)
 
-    created_dir_dict = {}
+    created_mountpoint = {}
 
     if config.get('env') is not None:
         assert isinstance(config['env'], dict), 'env must be a dictionary'
     clients = config['clients']
+
+    # Create scratch dirs for any non-all workunits
     log.info('Making a separate scratch dir for every client...')
     for role in clients.iterkeys():
         assert isinstance(role, basestring)
         if role == "all":
             continue
-        PREFIX = 'client.'
-        assert role.startswith(PREFIX)
-        created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
-        created_dir_dict[role] = created_mnt_dir
-
-    all_spec = False #is there an all grouping?
+
+        assert role.startswith(CLIENT_PREFIX)
+        created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
+        created_mountpoint[role] = created_mnt_dir
+
+    # Execute any non-all workunits
     with parallel() as p:
         for role, tests in clients.iteritems():
             if role != "all":
                 p.spawn(_run_tests, ctx, refspec, role, tests,
                         config.get('env'), timeout=timeout)
-            else:
-                all_spec = True
 
-    if all_spec:
+    # Clean up dirs from any non-all workunits
+    for role, created in created_mountpoint.items():
+        _delete_dir(ctx, role, created)
+
+    # Execute any 'all' workunits
+    if 'all' in clients:
         all_tasks = clients["all"]
         _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
                               config.get('subdir'), timeout=timeout)
-
-    for role in clients.iterkeys():
-        assert isinstance(role, basestring)
-        if role == "all":
-            continue
-        PREFIX = 'client.'
-        assert role.startswith(PREFIX)
-        if created_dir_dict[role]:
-            _delete_dir(ctx, role)
 
 
-def _delete_dir(ctx, role):
+def _delete_dir(ctx, role, created_mountpoint):
     """
     Delete file used by this role, and delete the directory that this
     role appeared in.
@@ -124,37 +123,35 @@ def _delete_dir(ctx, role):
     :param ctx: Context
     :param role: "role.#" where # is used for the role id.
     """
-    PREFIX = 'client.'
-    testdir = teuthology.get_testdir(ctx)
-    id_ = role[len(PREFIX):]
+    testdir = misc.get_testdir(ctx)
+    id_ = role[len(CLIENT_PREFIX):]
     (remote,) = ctx.cluster.only(role).remotes.iterkeys()
     mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
     # Is there any reason why this is not: join(mnt, role) ?
     client = os.path.join(mnt, 'client.{id}'.format(id=id_))
-    try:
-        remote.run(
-            args=[
-                'rm',
-                '-rf',
-                '--',
-                client,
-            ],
-        )
-        log.info("Deleted dir {dir}".format(dir=client))
-    except Exception:
-        log.exception("Caught an exception deleting dir {dir}".format(dir=client))
 
-    try:
+    # Remove the directory inside the mount where the workunit ran
+    remote.run(
+        args=[
+            'rm',
+            '-rf',
+            '--',
+            client,
+        ],
+    )
+    log.info("Deleted dir {dir}".format(dir=client))
+
+    # If the mount was an artificially created dir, delete that too
+    if created_mountpoint:
         remote.run(
             args=[
                 'rmdir',
                 '--',
                 mnt,
             ],
         )
-        log.info("Deleted dir {dir}".format(dir=mnt))
-    except Exception:
-        log.exception("Caught an exception deleting dir {dir}".format(dir=mnt))
+        log.info("Deleted artificial mount point {dir}".format(dir=mnt))
 
 
 def _make_scratch_dir(ctx, role, subdir):
     """
@@ -165,13 +162,12 @@ def _make_scratch_dir(ctx, role, subdir):
     :param role: "role.#" where # is used for the role id.
     :param subdir: use this subdir (False if not used)
     """
-    retVal = False
-    PREFIX = 'client.'
-    id_ = role[len(PREFIX):]
+    created_mountpoint = False
+    id_ = role[len(CLIENT_PREFIX):]
     log.debug("getting remote for {id} role {role_}".format(id=id_, role_=role))
     (remote,) = ctx.cluster.only(role).remotes.iterkeys()
     dir_owner = remote.user
-    mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_))
+    mnt = os.path.join(misc.get_testdir(ctx), 'mnt.{id}'.format(id=id_))
     # if neither kclient nor ceph-fuse are required for a workunit,
     # mnt may not exist. Stat and create the directory if it doesn't.
     try:
@@ -180,22 +176,24 @@ def _make_scratch_dir(ctx, role, subdir):
                 'stat',
                 '--',
                 mnt,
             ],
         )
         log.info('Did not need to create dir {dir}'.format(dir=mnt))
-    except Exception:
+    except CommandFailedError:
         remote.run(
             args=[
                 'mkdir',
                 '--',
                 mnt,
             ],
         )
         log.info('Created dir {dir}'.format(dir=mnt))
-        retVal = True
+        created_mountpoint = True
 
-    if not subdir: subdir = 'client.{id}'.format(id=id_)
+    if not subdir:
+        subdir = 'client.{id}'.format(id=id_)
 
-    if retVal:
+    if created_mountpoint:
         remote.run(
             args=[
                 'cd',
@@ -205,8 +203,8 @@ def _make_scratch_dir(ctx, role, subdir):
                 'mkdir',
                 '--',
                 subdir,
             ],
         )
     else:
         remote.run(
             args=[
@@ -224,10 +222,10 @@ def _make_scratch_dir(ctx, role, subdir):
                 '--owner={user}'.format(user=dir_owner),
                 '--',
                 subdir,
             ],
         )
 
-    return retVal
+    return created_mountpoint
 
 
 def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
@@ -237,12 +235,14 @@ def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
     See run_tests() for parameter documentation.
     """
-    client_generator = teuthology.all_roles_of_type(ctx.cluster, 'client')
+    client_generator = misc.all_roles_of_type(ctx.cluster, 'client')
     client_remotes = list()
+
+    created_mountpoint = {}
     for client in client_generator:
         (client_remote,) = ctx.cluster.only('client.{id}'.format(id=client)).remotes.iterkeys()
         client_remotes.append((client_remote, 'client.{id}'.format(id=client)))
-        _make_scratch_dir(ctx, "client.{id}".format(id=client), subdir)
+        created_mountpoint[client] = _make_scratch_dir(ctx, "client.{id}".format(id=client), subdir)
 
     for unit in tests:
         with parallel() as p:
@@ -251,9 +251,9 @@ def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
                         timeout=timeout)
 
     # cleanup the generated client directories
-    client_generator = teuthology.all_roles_of_type(ctx.cluster, 'client')
+    client_generator = misc.all_roles_of_type(ctx.cluster, 'client')
     for client in client_generator:
-        _delete_dir(ctx, 'client.{id}'.format(id=client))
+        _delete_dir(ctx, 'client.{id}'.format(id=client), created_mountpoint[client])
 
 
 def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
@@ -274,11 +274,10 @@ def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
         hours, or 'd' for days. If '0' or anything that evaluates
         to False is passed, the 'timeout' command is not used.
     """
-    testdir = teuthology.get_testdir(ctx)
+    testdir = misc.get_testdir(ctx)
     assert isinstance(role, basestring)
-    PREFIX = 'client.'
-    assert role.startswith(PREFIX)
-    id_ = role[len(PREFIX):]
+    assert role.startswith(CLIENT_PREFIX)
+    id_ = role[len(CLIENT_PREFIX):]
     (remote,) = ctx.cluster.only(role).remotes.iterkeys()
     mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
     # subdir so we can remove and recreate this a lot without sudo
@@ -309,12 +308,12 @@ def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
             run.Raw('&&'),
             'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir),
             run.Raw('>{tdir}/workunits.list'.format(tdir=testdir)),
         ],
     )
-    workunits = sorted(teuthology.get_file(
+    workunits = sorted(misc.get_file(
        remote,
        '{tdir}/workunits.list'.format(tdir=testdir)).split('\0'))
     assert workunits
 
     try:
@@ -336,7 +335,7 @@ def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
                 run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
                 run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
                 run.Raw('CEPH_ID="{id}"'.format(id=id_)),
             ]
             if env is not None:
                 for var, val in env.iteritems():
                     quoted_val = pipes.quote(val)
@@ -352,21 +351,21 @@ def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
                    '{srcdir}/{workunit}'.format(
                        srcdir=srcdir,
                        workunit=workunit,
                    ),
                ])
                remote.run(
                    logger=log.getChild(role),
                    args=args,
                )
                remote.run(
                    logger=log.getChild(role),
                    args=['sudo', 'rm', '-rf', '--', scratch_tmp],
                )
     finally:
-        log.info('Stopping %s on %s...', spec, role)
+        log.info('Stopping %s on %s...', tests, role)
         remote.run(
             logger=log.getChild(role),
             args=[
                 'rm', '-rf', '--', '{tdir}/workunits.list'.format(tdir=testdir), srcdir,
             ],
         )
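
The workunit cleanup is now symmetric: _make_scratch_dir() reports whether it had to create the mount point, and _delete_dir() removes the directory only in that case, leaving real kclient/ceph-fuse mounts in place. A condensed sketch of that contract, written as a try/finally variant of what task() does inline (the role name and argument values are illustrative):

    role = 'client.0'
    created = _make_scratch_dir(ctx, role, subdir=None)
    try:
        _run_tests(ctx, refspec, role, tests, env=None)
    finally:
        # 'created' is True only when mnt.<id> did not already exist;
        # _delete_dir() rmdir's the mount point in exactly that case.
        _delete_dir(ctx, role, created)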