2013-10-12 08:28:27 +00:00
|
|
|
"""
|
2013-11-21 19:56:41 +00:00
|
|
|
Ceph cluster task.
|
2013-10-12 08:28:27 +00:00
|
|
|
|
|
|
|
Handle the setup, starting, and clean-up of a Ceph cluster.
|
|
|
|
"""
|
2011-05-31 20:51:48 +00:00
|
|
|
from cStringIO import StringIO
|
|
|
|
|
2011-11-18 01:00:44 +00:00
|
|
|
import argparse
|
2011-06-03 21:47:44 +00:00
|
|
|
import contextlib
|
2011-05-31 20:51:48 +00:00
|
|
|
import logging
|
|
|
|
import os
|
2013-07-06 01:01:57 +00:00
|
|
|
import struct
|
2014-02-04 01:17:09 +00:00
|
|
|
import json
|
|
|
|
import time
|
2011-05-31 20:51:48 +00:00
|
|
|
|
2011-06-03 21:47:44 +00:00
|
|
|
from teuthology import misc as teuthology
|
2011-06-15 21:57:02 +00:00
|
|
|
from teuthology import contextutil
|
2011-09-13 21:53:02 +00:00
|
|
|
from ..orchestra import run
|
2013-03-21 23:14:54 +00:00
|
|
|
import ceph_client as cclient
|
2011-05-31 20:51:48 +00:00
|
|
|
|
2014-05-12 13:25:26 +00:00
|
|
|
# Path used for ceph.conf on the remotes when the caller does not supply one.
DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf'

# Module-level logger; DaemonState falls back to it when it is not given a
# 'logger' keyword argument.
log = logging.getLogger(__name__)
|
|
|
|
|
2011-09-14 23:31:58 +00:00
|
|
|
class DaemonState(object):
    """
    Daemon State.  A daemon exists for each instance of each role.

    Remembers the command used to start the daemon on its remote so that it
    can be restarted, and holds the handle of the currently running remote
    process in ``self.proc`` (``None`` when nothing is running).
    """
    def __init__(self, remote, role, id_, *command_args, **command_kwargs):
        """
        Pass remote command information as parameters to remote site

        :param remote: Remote site
        :param role: Role (osd, rgw, mon, mds)
        :param id_: Id within role (osd.1, osd.2, for example)
        :param command_args: positional arguments (used in restart commands)
        :param command_kwargs: keyword arguments (used in restart commands);
                               an optional 'logger' entry replaces the
                               module-level logger for this daemon.
        """
        self.remote = remote
        self.command_args = command_args
        self.command_kwargs = command_kwargs
        self.role = role
        self.id_ = id_
        self.log = command_kwargs.get('logger', log)
        self.proc = None

    def stop(self):
        """
        Stop this daemon instance.

        Closes the stdin of the remote process and waits for it to exit.
        The process handle is cleared even when waiting fails, so that the
        daemon is no longer reported as running and can be restarted.

        Note: this can raise a run.CommandFailedError,
        run.CommandCrashedError, or run.ConnectionLostError.
        """
        if not self.running():
            self.log.error('tried to stop a non-running daemon')
            return
        self.proc.stdin.close()
        self.log.debug('waiting for process to exit')
        try:
            run.wait([self.proc])
        finally:
            # Same policy as wait_for_exit(): whatever happened to the
            # process, we must not keep treating it as running.
            self.proc = None
        self.log.info('Stopped')

    def _stop_for_restart(self):
        """
        Log the restart and stop the current process, if there is one.
        """
        self.log.info('Restarting')
        if self.proc is not None:
            self.log.debug('stopping old one...')
            self.stop()

    def restart(self, *args, **kwargs):
        """
        Restart with a new command passed in the arguments

        The extra positional arguments apply to this restart only.  The
        keyword arguments are merged into the stored command_kwargs, so
        they also apply to every later restart.

        :param args: positional arguments passed to remote.run
        :param kwargs: keyword arguments passed to remote.run
        """
        self._stop_for_restart()
        cmd_args = list(self.command_args)
        cmd_args.extend(args)
        # Deliberately the stored dict itself (not a copy): see docstring.
        cmd_kwargs = self.command_kwargs
        cmd_kwargs.update(kwargs)
        self.proc = self.remote.run(*cmd_args, **cmd_kwargs)
        self.log.info('Started')

    def restart_with_args(self, extra_args):
        """
        Restart, adding new parameters to the current command.

        The stored command is left untouched; the extra arguments are used
        for this restart only.

        :param extra_args: List of extra arguments appended to the 'args'
                           list of the stored command.
        """
        from copy import deepcopy

        self._stop_for_restart()
        cmd_args = list(self.command_args)
        # we only want to make a temporary mod of the args list
        # so we shallow copy the dict, and deepcopy the args list
        cmd_kwargs = self.command_kwargs.copy()
        cmd_kwargs['args'] = deepcopy(self.command_kwargs['args'])
        cmd_kwargs['args'].extend(extra_args)
        self.proc = self.remote.run(*cmd_args, **cmd_kwargs)
        self.log.info('Started')

    def signal(self, sig):
        """
        Send a signal to associated remote command

        The signal number is written to the stdin of the remote process as
        a single signed byte in network byte order.

        :param sig: signal to send
        """
        self.proc.stdin.write(struct.pack('!b', sig))
        self.log.info('Sent signal %d', sig)

    def running(self):
        """
        Are we running?

        :return: True if remote run command value is set, False otherwise.
        """
        return self.proc is not None

    def reset(self):
        """
        clear remote run command value.
        """
        self.proc = None

    def wait_for_exit(self):
        """
        clear remote run command value after waiting for exit.
        """
        if self.proc:
            try:
                run.wait([self.proc])
            finally:
                self.proc = None
|
2011-09-14 23:31:58 +00:00
|
|
|
|
|
|
|
class CephState(object):
    """
    Registry of DaemonState instances, grouped by role and then by id.
    """
    def __init__(self):
        """
        Start with an empty registry.

        self.daemons maps a role to a dictionary which in turn maps an id
        to the DaemonState registered for that role and id.
        """
        self.daemons = dict()

    def add_daemon(self, remote, role, id_, *args, **kwargs):
        """
        Register a daemon and start it.  A daemon already registered under
        the same role and id_ is stopped first and then replaced.

        :param remote: Remote site
        :param role: Role (osd, mds, mon, rgw, for example)
        :param id_: Id (index into role dictionary)
        :param args: Daemonstate positional parameters
        :param kwargs: Daemonstate keyword parameters
        """
        by_id = self.daemons.setdefault(role, {})
        if id_ in by_id:
            by_id[id_].stop()
            by_id[id_] = None
        daemon = DaemonState(remote, role, id_, *args, **kwargs)
        by_id[id_] = daemon
        daemon.restart()

    def get_daemon(self, role, id_):
        """
        Look up the daemon registered for this role and id_.

        :param role: Role (osd, mds, mon, rgw, for example)
        :param id_: Id (index into role dictionary); converted to a string
                    before the lookup
        :returns: the registered daemon, or None if there is none
        """
        try:
            by_id = self.daemons[role]
        except KeyError:
            return None
        return by_id.get(str(id_))

    def iter_daemons_of_role(self, role):
        """
        Return the daemon instances registered for this role (nothing if
        the role is unknown).

        :param role: Role (osd, mds, mon, rgw, for example)
        """
        by_id = self.daemons.get(role, {})
        return by_id.values()
|
2011-06-15 21:57:02 +00:00
|
|
|
|
2013-02-17 07:44:03 +00:00
|
|
|
|
2011-06-16 20:13:32 +00:00
|
|
|
@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Prepare /var/log/ceph on every remote before the tests run: make it
    accessible to everyone, remove the ceph logrotate configuration, and
    add the valgrind and profiling-logger sub-directories.

    Nothing is undone on exit.

    :param ctx: Context
    :param config: Configuration
    """
    # (log message, remote command) pairs, executed in this order.
    steps = [
        (
            'Making ceph log dir writeable by non-root...',
            ['sudo', 'chmod', '777', '/var/log/ceph'],
        ),
        (
            'Disabling ceph logrotate...',
            ['sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph'],
        ),
        (
            'Creating extra log directories...',
            [
                'sudo',
                'install', '-d', '-m0755', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
        ),
    ]
    for message, command in steps:
        log.info(message)
        # Start the command on all remotes, then wait for all of them.
        started = ctx.cluster.run(args=command, wait=False)
        run.wait(started)

    yield
|
2011-06-16 20:13:32 +00:00
|
|
|
|
2013-09-06 19:22:29 +00:00
|
|
|
|
2011-06-15 21:57:02 +00:00
|
|
|
@contextlib.contextmanager
def ship_utilities(ctx, config):
    """
    Write a copy of valgrind.supp into the test directory of each of the
    remote sites, and install the helper executables used by Ceph tests
    (daemon-helper, adjust-ulimits, kcon_most) in /usr/bin.  On exit,
    remove all of these files again.

    :param ctx: Context
    :param config: Configuration; must be None
    """
    assert config is None
    testdir = teuthology.get_testdir(ctx)
    srcdir = os.path.dirname(__file__)
    # Every remote path we create, so that it can be removed on exit.
    filenames = []

    log.info('Shipping valgrind.supp...')
    with open(os.path.join(srcdir, 'valgrind.supp'), 'rb') as f:
        fn = os.path.join(testdir, 'valgrind.supp')
        filenames.append(fn)
        for rem in ctx.cluster.remotes:
            teuthology.sudo_write_file(
                remote=rem,
                path=fn,
                data=f,
            )
            # The same file object is sent to every remote; rewind it.
            f.seek(0)

    FILES = ['daemon-helper', 'adjust-ulimits', 'kcon_most']
    destdir = '/usr/bin'
    for filename in FILES:
        log.info('Shipping %r...', filename)
        src = os.path.join(srcdir, filename)
        dst = os.path.join(destdir, filename)
        filenames.append(dst)
        with open(src, 'rb') as f:
            for rem in ctx.cluster.remotes:
                teuthology.sudo_write_file(
                    remote=rem,
                    path=dst,
                    data=f,
                )
                f.seek(0)
                # Make the shipped helper readable and executable by all.
                rem.run(
                    args=[
                        'sudo',
                        'chmod',
                        'a=rx',
                        '--',
                        dst,
                    ],
                )

    try:
        yield
    finally:
        log.info('Removing shipped files: %s...', ' '.join(filenames))
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-f',
                    '--',
                ] + list(filenames),
                wait=False,
            ),
        )
|
|
|
|
|
2011-06-15 21:57:02 +00:00
|
|
|
|
2011-10-03 21:03:36 +00:00
|
|
|
def assign_devs(roles, devs):
    """
    Pair each role with the device at the same position.

    Pairing stops as soon as either sequence runs out.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    assignment = {}
    for role, dev in zip(roles, devs):
        assignment[role] = dev
    return assignment
|
|
|
|
|
2011-08-29 20:58:09 +00:00
|
|
|
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    Once the tests have run, search the valgrind logs of every remote for
    reported issues.  An exception is raised if any issue was found, or if
    a line of the search output could not be parsed.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        log.info('Checking for errors in any valgrind logs...')

        # Start the search on every node before collecting any result.
        searches = []
        for remote in ctx.cluster.remotes.iterkeys():
            search_cmd = [
                'sudo',
                'zgrep',
                '<kind>',
                run.Raw('/var/log/ceph/valgrind/*'),
                # include a second file so that we always get a filename
                # prefix on the output
                '/dev/null',
                run.Raw('|'),
                'sort',
                run.Raw('|'),
                'uniq',
            ]
            search = remote.run(
                args=search_cmd,
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            searches.append((search, remote))

        issue_seen = False
        for (search, remote) in searches:
            search.exitstatus.get()
            for entry in search.stdout.getvalue().split('\n'):
                if not entry:
                    continue
                # Each entry is expected to look like "<log file>:<kind>".
                try:
                    (logfile, kind) = entry.split(':')
                except Exception:
                    log.error('failed to split line %s', entry)
                    raise
                log.debug('file %s kind %s', logfile, kind)
                # "Lost" reports coming from mds logs are ignored.
                if 'mds' in logfile and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, logfile)
                issue_seen = True

        if issue_seen:
            raise Exception('saw valgrind issues')
|
2011-08-29 20:58:09 +00:00
|
|
|
|
2013-01-23 02:13:19 +00:00
|
|
|
|
|
|
|
def mount_osd_data(ctx, remote, osd):
    """
    Mount the data device of a remote OSD.

    Nothing is mounted when ctx.disk_config has no device recorded for
    this remote and osd.

    :param ctx: Context
    :param remote: Remote site
    :param osd: Osd name
    """
    log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
    disk_config = ctx.disk_config
    devs_by_remote = disk_config.remote_to_roles_to_dev
    if remote not in devs_by_remote or osd not in devs_by_remote[remote]:
        return

    dev = devs_by_remote[remote][osd]
    mount_options = disk_config.remote_to_roles_to_dev_mount_options[remote][osd]
    fstype = disk_config.remote_to_roles_to_dev_fstype[remote][osd]
    mnt = os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=osd))

    # Report the device that is actually mounted; the host is named
    # separately so that neither piece of information is lost.
    log.info('Mounting osd.{o}: dev: {n}, host: {h}, mountpoint: {p}, type: {t}, options: {v}'.format(
             o=osd, n=dev, h=remote.name, p=mnt, t=fstype, v=mount_options))

    remote.run(
        args=[
            'sudo',
            'mount',
            '-t', fstype,
            '-o', ','.join(mount_options),
            dev,
            mnt,
        ]
    )
|
|
|
|
|
2013-03-13 03:17:16 +00:00
|
|
|
def make_admin_daemon_dir(ctx, remote):
    """
    Make sure the /var/run/ceph directory exists on the remote site, with
    mode 0777.

    :param ctx: Context
    :param remote: Remote site
    """
    install_cmd = ['sudo']
    install_cmd += ['install', '-d', '-m0777', '--', '/var/run/ceph']
    remote.run(args=install_cmd)
|
|
|
|
|
2014-05-12 13:25:26 +00:00
|
|
|
|
|
|
|
def write_conf(ctx, conf_path=DEFAULT_CONF_PATH):
    """
    Write the configuration held in ctx.ceph.conf to conf_path on every
    remote of the cluster.

    /etc/ceph is created (mode 0755) if needed, and the written file is
    given mode 0644.

    :param ctx: Context
    :param conf_path: Path of the configuration file on the remotes
    """
    # Serialize the configuration once; the same text is sent everywhere.
    conf_fp = StringIO()
    ctx.ceph.conf.write(conf_fp)
    conf_fp.seek(0)
    # Start, on every remote, a writer that copies its stdin to conf_path.
    writes = ctx.cluster.run(
        args=[
            'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
            'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
            'sudo', 'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            conf_path,
            run.Raw('&&'),
            'sudo', 'chmod', '0644', conf_path,
        ],
        stdin=run.PIPE,
        wait=False)
    log.debug('Writing ceph configuration to %s on all remotes', conf_path)
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)
|
|
|
|
|
|
|
|
|
2011-06-16 17:36:15 +00:00
|
|
|
@contextlib.contextmanager
|
|
|
|
def cluster(ctx, config):
|
2013-10-12 08:28:27 +00:00
|
|
|
"""
|
|
|
|
Handle the creation and removal of a ceph cluster.
|
|
|
|
|
|
|
|
On startup:
|
|
|
|
Create directories needed for the cluster.
|
|
|
|
Create remote journals for all osds.
|
|
|
|
Create and set keyring.
|
|
|
|
Copy the monmap to the test systems.
|
|
|
|
Setup mon nodes.
|
|
|
|
Setup mds nodes.
|
|
|
|
Mkfs osd nodes.
|
|
|
|
Add keyring information to monmaps
|
|
|
|
Mkfs mon nodes.
|
2013-11-21 19:56:41 +00:00
|
|
|
|
2013-10-12 08:28:27 +00:00
|
|
|
On exit:
|
|
|
|
If errors occurred, extract a failure message and store in ctx.summary.
|
|
|
|
Unmount all test files and temporary journaling files.
|
|
|
|
Save the monitor information and archive all ceph logs.
|
|
|
|
Cleanup the keyring setup, and remove all monitor map and data files left over.
|
|
|
|
|
|
|
|
:param ctx: Context
|
|
|
|
:param config: Configuration
|
|
|
|
"""
|
2013-11-21 19:56:41 +00:00
|
|
|
if ctx.config.get('use_existing_cluster', False) is True:
|
|
|
|
log.info("'use_existing_cluster' is true; skipping cluster creation")
|
|
|
|
yield
|
|
|
|
|
2013-01-23 20:37:39 +00:00
|
|
|
testdir = teuthology.get_testdir(ctx)
|
2011-06-16 17:36:15 +00:00
|
|
|
log.info('Creating ceph cluster...')
|
2011-06-16 21:05:13 +00:00
|
|
|
run.wait(
|
|
|
|
ctx.cluster.run(
|
|
|
|
args=[
|
|
|
|
'install', '-d', '-m0755', '--',
|
2013-01-23 20:37:39 +00:00
|
|
|
'{tdir}/data'.format(tdir=testdir),
|
2011-06-16 21:05:13 +00:00
|
|
|
],
|
|
|
|
wait=False,
|
|
|
|
)
|
|
|
|
)
|
2011-06-16 17:36:15 +00:00
|
|
|
|
2013-02-18 17:41:00 +00:00
|
|
|
run.wait(
|
|
|
|
ctx.cluster.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
2013-02-21 21:13:51 +00:00
|
|
|
'install', '-d', '-m0777', '--', '/var/run/ceph',
|
2013-02-18 17:41:00 +00:00
|
|
|
],
|
|
|
|
wait=False,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
2012-03-27 22:05:11 +00:00
|
|
|
|
|
|
|
devs_to_clean = {}
|
|
|
|
remote_to_roles_to_devs = {}
|
|
|
|
remote_to_roles_to_journals = {}
|
|
|
|
osds = ctx.cluster.only(teuthology.is_type('osd'))
|
|
|
|
for remote, roles_for_host in osds.remotes.iteritems():
|
|
|
|
devs = teuthology.get_scratch_devices(remote)
|
|
|
|
roles_to_devs = {}
|
|
|
|
roles_to_journals = {}
|
|
|
|
if config.get('fs'):
|
2013-01-23 02:13:19 +00:00
|
|
|
log.info('fs option selected, checking for scratch devs')
|
2012-03-27 22:05:11 +00:00
|
|
|
log.info('found devs: %s' % (str(devs),))
|
2013-01-23 02:27:41 +00:00
|
|
|
devs_id_map = teuthology.get_wwn_id_map(remote, devs)
|
2013-02-01 16:16:44 +00:00
|
|
|
iddevs = devs_id_map.values()
|
2012-03-27 22:05:11 +00:00
|
|
|
roles_to_devs = assign_devs(
|
2013-01-23 02:27:41 +00:00
|
|
|
teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
|
2012-03-27 22:05:11 +00:00
|
|
|
)
|
2013-01-23 02:27:41 +00:00
|
|
|
if len(roles_to_devs) < len(iddevs):
|
|
|
|
iddevs = iddevs[len(roles_to_devs):]
|
2012-03-27 22:05:11 +00:00
|
|
|
devs_to_clean[remote] = []
|
2013-01-23 02:13:19 +00:00
|
|
|
|
2012-03-27 22:05:11 +00:00
|
|
|
if config.get('block_journal'):
|
|
|
|
log.info('block journal enabled')
|
|
|
|
roles_to_journals = assign_devs(
|
2013-01-23 02:27:41 +00:00
|
|
|
teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
|
2012-03-27 22:05:11 +00:00
|
|
|
)
|
|
|
|
log.info('journal map: %s', roles_to_journals)
|
2012-08-16 22:50:10 +00:00
|
|
|
|
|
|
|
if config.get('tmpfs_journal'):
|
|
|
|
log.info('tmpfs journal enabled')
|
|
|
|
roles_to_journals = {}
|
|
|
|
remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
|
|
|
|
for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
|
|
|
|
tmpfs = '/mnt/osd.%s' % osd
|
|
|
|
roles_to_journals[osd] = tmpfs
|
|
|
|
remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
|
|
|
|
log.info('journal map: %s', roles_to_journals)
|
|
|
|
|
2013-01-23 02:27:41 +00:00
|
|
|
log.info('dev map: %s' % (str(roles_to_devs),))
|
2012-03-27 22:05:11 +00:00
|
|
|
remote_to_roles_to_devs[remote] = roles_to_devs
|
|
|
|
remote_to_roles_to_journals[remote] = roles_to_journals
|
|
|
|
|
|
|
|
|
2011-07-05 23:45:32 +00:00
|
|
|
log.info('Generating config...')
|
2011-06-16 17:36:15 +00:00
|
|
|
remotes_and_roles = ctx.cluster.remotes.items()
|
2013-02-21 22:47:00 +00:00
|
|
|
roles = [role_list for (remote, role_list) in remotes_and_roles]
|
|
|
|
ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
|
2013-01-23 20:37:39 +00:00
|
|
|
conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
|
2012-03-27 22:05:11 +00:00
|
|
|
for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
|
|
|
|
for role, journal in roles_to_journals.iteritems():
|
|
|
|
key = "osd." + str(role)
|
|
|
|
if key not in conf:
|
|
|
|
conf[key] = {}
|
|
|
|
conf[key]['osd journal'] = journal
|
2011-07-05 23:45:32 +00:00
|
|
|
for section, keys in config['conf'].iteritems():
|
|
|
|
for key, value in keys.iteritems():
|
|
|
|
log.info("[%s] %s = %s" % (section, key, value))
|
|
|
|
if section not in conf:
|
|
|
|
conf[section] = {}
|
|
|
|
conf[section][key] = value
|
2011-07-12 01:00:03 +00:00
|
|
|
|
2012-08-16 22:50:10 +00:00
|
|
|
if config.get('tmpfs_journal'):
|
|
|
|
conf['journal dio'] = False
|
|
|
|
|
2011-11-18 01:00:44 +00:00
|
|
|
ctx.ceph = argparse.Namespace()
|
|
|
|
ctx.ceph.conf = conf
|
2011-11-09 06:00:32 +00:00
|
|
|
|
2013-02-06 19:16:52 +00:00
|
|
|
keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')
|
|
|
|
|
2013-01-23 20:37:39 +00:00
|
|
|
coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
|
2011-06-16 17:36:15 +00:00
|
|
|
|
2011-08-31 20:56:42 +00:00
|
|
|
firstmon = teuthology.get_first_mon(ctx, config)
|
2011-07-27 04:46:47 +00:00
|
|
|
|
|
|
|
log.info('Setting up %s...' % firstmon)
|
|
|
|
ctx.cluster.only(firstmon).run(
|
2011-06-16 17:36:15 +00:00
|
|
|
args=[
|
2013-02-06 19:16:52 +00:00
|
|
|
'sudo',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-coverage',
|
2011-06-16 17:36:15 +00:00
|
|
|
coverage_dir,
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-authtool',
|
2011-06-16 17:36:15 +00:00
|
|
|
'--create-keyring',
|
2013-02-06 19:16:52 +00:00
|
|
|
keyring_path,
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
)
|
2011-07-27 04:46:47 +00:00
|
|
|
ctx.cluster.only(firstmon).run(
|
2011-06-16 17:36:15 +00:00
|
|
|
args=[
|
2013-02-06 19:16:52 +00:00
|
|
|
'sudo',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-coverage',
|
2011-06-16 17:36:15 +00:00
|
|
|
coverage_dir,
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-authtool',
|
2011-06-16 17:36:15 +00:00
|
|
|
'--gen-key',
|
|
|
|
'--name=mon.',
|
2013-02-06 19:16:52 +00:00
|
|
|
keyring_path,
|
|
|
|
],
|
|
|
|
)
|
|
|
|
ctx.cluster.only(firstmon).run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
|
|
|
'chmod',
|
|
|
|
'0644',
|
|
|
|
keyring_path,
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
)
|
2014-03-27 16:35:28 +00:00
|
|
|
(mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
|
2013-12-12 21:33:19 +00:00
|
|
|
fsid = teuthology.create_simple_monmap(
|
2013-01-23 20:37:39 +00:00
|
|
|
ctx,
|
2011-06-16 17:36:15 +00:00
|
|
|
remote=mon0_remote,
|
|
|
|
conf=conf,
|
|
|
|
)
|
2013-12-12 21:33:19 +00:00
|
|
|
if not 'global' in conf:
|
|
|
|
conf['global'] = {}
|
|
|
|
conf['global']['fsid'] = fsid
|
|
|
|
|
|
|
|
log.info('Writing ceph.conf for FSID %s...' % fsid)
|
2014-05-12 13:25:26 +00:00
|
|
|
conf_path = config.get('conf_path', DEFAULT_CONF_PATH)
|
|
|
|
write_conf(ctx, conf_path)
|
2011-06-16 17:36:15 +00:00
|
|
|
|
2011-07-27 04:46:47 +00:00
|
|
|
log.info('Creating admin key on %s...' % firstmon)
|
|
|
|
ctx.cluster.only(firstmon).run(
|
2011-06-16 17:36:15 +00:00
|
|
|
args=[
|
2013-02-06 19:16:52 +00:00
|
|
|
'sudo',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-coverage',
|
2011-06-16 17:36:15 +00:00
|
|
|
coverage_dir,
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-authtool',
|
2011-06-16 17:36:15 +00:00
|
|
|
'--gen-key',
|
|
|
|
'--name=client.admin',
|
|
|
|
'--set-uid=0',
|
|
|
|
'--cap', 'mon', 'allow *',
|
|
|
|
'--cap', 'osd', 'allow *',
|
|
|
|
'--cap', 'mds', 'allow',
|
2013-02-06 19:16:52 +00:00
|
|
|
keyring_path,
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
)
|
|
|
|
|
2011-07-07 22:40:37 +00:00
|
|
|
log.info('Copying monmap to all nodes...')
|
2011-06-16 17:36:15 +00:00
|
|
|
keyring = teuthology.get_file(
|
|
|
|
remote=mon0_remote,
|
2013-02-06 19:16:52 +00:00
|
|
|
path=keyring_path,
|
2011-06-16 17:36:15 +00:00
|
|
|
)
|
|
|
|
monmap = teuthology.get_file(
|
|
|
|
remote=mon0_remote,
|
2013-01-23 20:37:39 +00:00
|
|
|
path='{tdir}/monmap'.format(tdir=testdir),
|
2011-06-16 17:36:15 +00:00
|
|
|
)
|
|
|
|
|
2011-07-07 22:40:37 +00:00
|
|
|
for rem in ctx.cluster.remotes.iterkeys():
|
2011-06-16 17:36:15 +00:00
|
|
|
# copy mon key and initial monmap
|
2011-07-07 22:40:37 +00:00
|
|
|
log.info('Sending monmap to node {remote}'.format(remote=rem))
|
2013-02-06 19:16:52 +00:00
|
|
|
teuthology.sudo_write_file(
|
2011-06-16 17:36:15 +00:00
|
|
|
remote=rem,
|
2013-02-06 19:16:52 +00:00
|
|
|
path=keyring_path,
|
2011-06-16 17:36:15 +00:00
|
|
|
data=keyring,
|
2013-02-06 19:16:52 +00:00
|
|
|
perms='0644'
|
2011-06-16 17:36:15 +00:00
|
|
|
)
|
|
|
|
teuthology.write_file(
|
|
|
|
remote=rem,
|
2013-01-23 20:37:39 +00:00
|
|
|
path='{tdir}/monmap'.format(tdir=testdir),
|
2011-06-16 17:36:15 +00:00
|
|
|
data=monmap,
|
|
|
|
)
|
|
|
|
|
|
|
|
log.info('Setting up mon nodes...')
|
2011-07-07 22:40:37 +00:00
|
|
|
mons = ctx.cluster.only(teuthology.is_type('mon'))
|
2011-06-16 17:36:15 +00:00
|
|
|
run.wait(
|
|
|
|
mons.run(
|
|
|
|
args=[
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-coverage',
|
2011-06-16 17:36:15 +00:00
|
|
|
coverage_dir,
|
2013-02-06 19:16:52 +00:00
|
|
|
'osdmaptool',
|
|
|
|
'-c', conf_path,
|
2011-06-16 17:36:15 +00:00
|
|
|
'--clobber',
|
|
|
|
'--createsimple', '{num:d}'.format(
|
|
|
|
num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
|
|
|
|
),
|
2013-01-23 20:37:39 +00:00
|
|
|
'{tdir}/osdmap'.format(tdir=testdir),
|
2011-06-16 17:36:15 +00:00
|
|
|
'--pg_bits', '2',
|
|
|
|
'--pgp_bits', '4',
|
|
|
|
],
|
|
|
|
wait=False,
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
log.info('Setting up mds nodes...')
|
|
|
|
mdss = ctx.cluster.only(teuthology.is_type('mds'))
|
|
|
|
for remote, roles_for_host in mdss.remotes.iteritems():
|
|
|
|
for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
|
|
|
|
remote.run(
|
|
|
|
args=[
|
2013-02-17 06:32:16 +00:00
|
|
|
'sudo',
|
|
|
|
'mkdir',
|
|
|
|
'-p',
|
|
|
|
'/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
|
|
|
|
run.Raw('&&'),
|
2013-09-05 21:41:27 +00:00
|
|
|
'sudo',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-coverage',
|
2011-06-16 17:36:15 +00:00
|
|
|
coverage_dir,
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-authtool',
|
2011-06-16 17:36:15 +00:00
|
|
|
'--create-keyring',
|
|
|
|
'--gen-key',
|
|
|
|
'--name=mds.{id}'.format(id=id_),
|
2013-02-17 06:32:16 +00:00
|
|
|
'/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
)
|
|
|
|
|
2013-03-21 23:14:54 +00:00
|
|
|
cclient.create_keyring(ctx)
|
2011-06-16 17:36:15 +00:00
|
|
|
log.info('Running mkfs on osd nodes...')
|
2013-05-01 20:14:35 +00:00
|
|
|
|
|
|
|
ctx.disk_config = argparse.Namespace()
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
|
|
|
|
ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev_fstype = {}
|
|
|
|
|
|
|
|
log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
|
2011-06-16 17:36:15 +00:00
|
|
|
for remote, roles_for_host in osds.remotes.iteritems():
|
2012-03-27 22:05:11 +00:00
|
|
|
roles_to_devs = remote_to_roles_to_devs[remote]
|
|
|
|
roles_to_journals = remote_to_roles_to_journals[remote]
|
2013-02-01 17:37:13 +00:00
|
|
|
|
2012-06-28 00:38:12 +00:00
|
|
|
|
2011-06-16 17:36:15 +00:00
|
|
|
for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
|
|
|
|
remote.run(
|
|
|
|
args=[
|
2013-02-17 06:32:16 +00:00
|
|
|
'sudo',
|
2011-06-16 17:36:15 +00:00
|
|
|
'mkdir',
|
2013-02-17 06:32:16 +00:00
|
|
|
'-p',
|
|
|
|
'/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
|
|
|
|
])
|
|
|
|
log.info(str(roles_to_journals))
|
|
|
|
log.info(id_)
|
2011-10-03 21:03:36 +00:00
|
|
|
if roles_to_devs.get(id_):
|
|
|
|
dev = roles_to_devs[id_]
|
2012-02-11 22:24:39 +00:00
|
|
|
fs = config.get('fs')
|
|
|
|
package = None
|
2012-07-13 18:30:07 +00:00
|
|
|
mkfs_options = config.get('mkfs_options')
|
|
|
|
mount_options = config.get('mount_options')
|
2012-02-11 22:24:39 +00:00
|
|
|
if fs == 'btrfs':
|
2013-07-13 03:56:07 +00:00
|
|
|
#package = 'btrfs-tools'
|
2012-07-13 18:30:07 +00:00
|
|
|
if mount_options is None:
|
|
|
|
mount_options = ['noatime','user_subvol_rm_allowed']
|
2012-07-13 18:30:21 +00:00
|
|
|
if mkfs_options is None:
|
|
|
|
mkfs_options = ['-m', 'single',
|
|
|
|
'-l', '32768',
|
|
|
|
'-n', '32768']
|
2012-02-11 22:24:39 +00:00
|
|
|
if fs == 'xfs':
|
2013-07-13 03:56:07 +00:00
|
|
|
#package = 'xfsprogs'
|
2012-07-13 18:30:07 +00:00
|
|
|
if mount_options is None:
|
|
|
|
mount_options = ['noatime']
|
|
|
|
if mkfs_options is None:
|
|
|
|
mkfs_options = ['-f', '-i', 'size=2048']
|
2012-02-11 22:24:39 +00:00
|
|
|
if fs == 'ext4' or fs == 'ext3':
|
2012-07-13 18:30:07 +00:00
|
|
|
if mount_options is None:
|
|
|
|
mount_options = ['noatime','user_xattr']
|
2012-02-11 22:24:39 +00:00
|
|
|
|
2012-07-22 03:18:24 +00:00
|
|
|
if mount_options is None:
|
|
|
|
mount_options = []
|
|
|
|
if mkfs_options is None:
|
|
|
|
mkfs_options = []
|
2012-07-13 18:30:07 +00:00
|
|
|
mkfs = ['mkfs.%s' % fs] + mkfs_options
|
2012-02-11 22:24:39 +00:00
|
|
|
log.info('%s on %s on %s' % (mkfs, dev, remote))
|
|
|
|
if package is not None:
|
|
|
|
remote.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
|
|
|
'apt-get', 'install', '-y', package
|
2013-02-06 19:16:52 +00:00
|
|
|
],
|
|
|
|
stdout=StringIO(),
|
2012-02-11 22:24:39 +00:00
|
|
|
)
|
2014-03-12 01:15:12 +00:00
|
|
|
|
|
|
|
try:
|
|
|
|
remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
|
|
|
|
except run.CommandFailedError:
|
|
|
|
# Newer btrfs-tools doesn't prompt for overwrite, use -f
|
|
|
|
if '-f' not in mount_options:
|
|
|
|
mkfs_options.append('-f')
|
|
|
|
mkfs = ['mkfs.%s' % fs] + mkfs_options
|
|
|
|
log.info('%s on %s on %s' % (mkfs, dev, remote))
|
|
|
|
remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
|
|
|
|
|
2012-07-13 18:30:07 +00:00
|
|
|
log.info('mount %s on %s -o %s' % (dev, remote,
|
|
|
|
','.join(mount_options)))
|
2011-10-03 21:03:36 +00:00
|
|
|
remote.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
|
|
|
'mount',
|
2012-02-11 22:24:39 +00:00
|
|
|
'-t', fs,
|
2012-07-13 18:30:07 +00:00
|
|
|
'-o', ','.join(mount_options),
|
2011-10-03 21:03:36 +00:00
|
|
|
dev,
|
2013-02-17 06:32:16 +00:00
|
|
|
os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
|
2011-10-03 21:03:36 +00:00
|
|
|
]
|
|
|
|
)
|
2013-01-23 02:27:41 +00:00
|
|
|
if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
|
|
|
|
if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
|
|
|
|
ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
|
2011-10-03 21:03:36 +00:00
|
|
|
devs_to_clean[remote].append(
|
|
|
|
os.path.join(
|
2013-02-17 06:32:16 +00:00
|
|
|
os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
|
2011-10-03 21:03:36 +00:00
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
|
2011-06-16 17:36:15 +00:00
|
|
|
remote.run(
|
|
|
|
args=[
|
2013-09-05 21:41:27 +00:00
|
|
|
'sudo',
|
2013-01-03 04:44:33 +00:00
|
|
|
'MALLOC_CHECK_=3',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-coverage',
|
2011-06-16 17:36:15 +00:00
|
|
|
coverage_dir,
|
2013-02-06 19:16:52 +00:00
|
|
|
'ceph-osd',
|
2011-06-16 17:36:15 +00:00
|
|
|
'--mkfs',
|
2013-02-17 06:32:16 +00:00
|
|
|
'--mkkey',
|
2011-06-16 17:36:15 +00:00
|
|
|
'-i', id_,
|
2013-01-23 20:37:39 +00:00
|
|
|
'--monmap', '{tdir}/monmap'.format(tdir=testdir),
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
)
|
2013-02-17 06:32:16 +00:00
|
|
|
|
|
|
|
|
|
|
|
log.info('Reading keys from all nodes...')
|
|
|
|
keys_fp = StringIO()
|
|
|
|
keys = []
|
|
|
|
for remote, roles_for_host in ctx.cluster.remotes.iteritems():
|
|
|
|
for type_ in ['mds','osd']:
|
|
|
|
for id_ in teuthology.roles_of_type(roles_for_host, type_):
|
|
|
|
data = teuthology.get_file(
|
|
|
|
remote=remote,
|
|
|
|
path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
|
|
|
|
type=type_,
|
|
|
|
id=id_,
|
|
|
|
),
|
|
|
|
sudo=True,
|
|
|
|
)
|
|
|
|
keys.append((type_, id_, data))
|
|
|
|
keys_fp.write(data)
|
|
|
|
for remote, roles_for_host in ctx.cluster.remotes.iteritems():
|
|
|
|
for type_ in ['client']:
|
|
|
|
for id_ in teuthology.roles_of_type(roles_for_host, type_):
|
|
|
|
data = teuthology.get_file(
|
|
|
|
remote=remote,
|
|
|
|
path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
|
|
|
|
)
|
|
|
|
keys.append((type_, id_, data))
|
|
|
|
keys_fp.write(data)
|
|
|
|
|
|
|
|
log.info('Adding keys to all mons...')
|
|
|
|
writes = mons.run(
|
|
|
|
args=[
|
|
|
|
'sudo', 'tee', '-a',
|
|
|
|
keyring_path,
|
|
|
|
],
|
|
|
|
stdin=run.PIPE,
|
|
|
|
wait=False,
|
|
|
|
stdout=StringIO(),
|
|
|
|
)
|
|
|
|
keys_fp.seek(0)
|
|
|
|
teuthology.feed_many_stdins_and_close(keys_fp, writes)
|
|
|
|
run.wait(writes)
|
|
|
|
for type_, id_, data in keys:
|
|
|
|
run.wait(
|
|
|
|
mons.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-17 06:32:16 +00:00
|
|
|
'ceph-coverage',
|
|
|
|
coverage_dir,
|
|
|
|
'ceph-authtool',
|
|
|
|
keyring_path,
|
|
|
|
'--name={type}.{id}'.format(
|
|
|
|
type=type_,
|
|
|
|
id=id_,
|
|
|
|
),
|
|
|
|
] + list(teuthology.generate_caps(type_)),
|
|
|
|
wait=False,
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
log.info('Running mkfs on mon nodes...')
|
|
|
|
for remote, roles_for_host in mons.remotes.iteritems():
|
|
|
|
for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
|
2013-02-22 01:04:06 +00:00
|
|
|
remote.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
|
|
|
'mkdir',
|
|
|
|
'-p',
|
|
|
|
'/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
|
|
|
|
],
|
|
|
|
)
|
2013-02-17 06:32:16 +00:00
|
|
|
remote.run(
|
|
|
|
args=[
|
2013-09-05 21:41:27 +00:00
|
|
|
'sudo',
|
2013-09-06 19:22:29 +00:00
|
|
|
'adjust-ulimits',
|
2013-02-17 06:32:16 +00:00
|
|
|
'ceph-coverage',
|
|
|
|
coverage_dir,
|
|
|
|
'ceph-mon',
|
|
|
|
'--mkfs',
|
|
|
|
'-i', id_,
|
|
|
|
'--monmap={tdir}/monmap'.format(tdir=testdir),
|
|
|
|
'--osdmap={tdir}/osdmap'.format(tdir=testdir),
|
|
|
|
'--keyring={kpath}'.format(kpath=keyring_path),
|
|
|
|
],
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2011-06-16 17:36:15 +00:00
|
|
|
run.wait(
|
|
|
|
mons.run(
|
|
|
|
args=[
|
|
|
|
'rm',
|
|
|
|
'--',
|
2013-01-23 20:37:39 +00:00
|
|
|
'{tdir}/monmap'.format(tdir=testdir),
|
|
|
|
'{tdir}/osdmap'.format(tdir=testdir),
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
wait=False,
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
try:
|
|
|
|
yield
|
2013-08-30 15:58:10 +00:00
|
|
|
except Exception:
|
2013-03-29 19:19:46 +00:00
|
|
|
# we need to know this below
|
|
|
|
ctx.summary['success'] = False
|
|
|
|
raise
|
2011-06-16 17:36:15 +00:00
|
|
|
finally:
|
2014-03-27 16:35:28 +00:00
|
|
|
(mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
|
2011-08-23 05:04:57 +00:00
|
|
|
|
2012-06-06 20:32:56 +00:00
|
|
|
log.info('Checking cluster log for badness...')
|
2011-10-17 21:42:03 +00:00
|
|
|
def first_in_ceph_log(pattern, excludes):
|
2013-10-12 08:28:27 +00:00
|
|
|
"""
|
|
|
|
Find the first occurrence of the pattern specified in the Ceph log,
|
2013-11-21 19:56:41 +00:00
|
|
|
Returns None if none found.
|
|
|
|
|
2013-10-12 08:28:27 +00:00
|
|
|
:param pattern: Pattern scanned for.
|
|
|
|
:param excludes: Patterns to ignore.
|
|
|
|
:return: First line of text (or None if not found)
|
|
|
|
"""
|
2011-10-17 21:42:03 +00:00
|
|
|
args = [
|
2013-03-11 00:08:55 +00:00
|
|
|
'sudo',
|
2011-10-17 21:42:03 +00:00
|
|
|
'egrep', pattern,
|
2013-02-18 20:14:12 +00:00
|
|
|
'/var/log/ceph/ceph.log',
|
2011-10-17 21:42:03 +00:00
|
|
|
]
|
|
|
|
for exclude in excludes:
|
|
|
|
args.extend([run.Raw('|'), 'egrep', '-v', exclude])
|
|
|
|
args.extend([
|
|
|
|
run.Raw('|'), 'head', '-n', '1',
|
|
|
|
])
|
2011-10-03 23:08:49 +00:00
|
|
|
r = mon0_remote.run(
|
2011-08-24 04:00:26 +00:00
|
|
|
stdout=StringIO(),
|
2011-10-17 21:42:03 +00:00
|
|
|
args=args,
|
2011-08-24 04:00:26 +00:00
|
|
|
)
|
2011-10-03 23:08:49 +00:00
|
|
|
stdout = r.stdout.getvalue()
|
2011-10-17 21:42:03 +00:00
|
|
|
if stdout != '':
|
2011-10-03 23:08:49 +00:00
|
|
|
return stdout
|
|
|
|
return None
|
|
|
|
|
2011-10-17 21:42:03 +00:00
|
|
|
if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
|
|
|
|
config['log_whitelist']) is not None:
|
2011-08-24 04:00:26 +00:00
|
|
|
log.warning('Found errors (ERR|WRN|SEC) in cluster log')
|
|
|
|
ctx.summary['success'] = False
|
2011-10-03 23:08:49 +00:00
|
|
|
# use the most severe problem as the failure reason
|
|
|
|
if 'failure_reason' not in ctx.summary:
|
|
|
|
for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
|
2011-10-17 21:42:03 +00:00
|
|
|
match = first_in_ceph_log(pattern, config['log_whitelist'])
|
2011-10-03 23:08:49 +00:00
|
|
|
if match is not None:
|
|
|
|
ctx.summary['failure_reason'] = \
|
|
|
|
'"{match}" in cluster log'.format(
|
|
|
|
match=match.rstrip('\n'),
|
|
|
|
)
|
|
|
|
break
|
2011-08-24 04:00:26 +00:00
|
|
|
|
2011-10-03 21:03:36 +00:00
|
|
|
for remote, dirs in devs_to_clean.iteritems():
|
|
|
|
for dir_ in dirs:
|
|
|
|
log.info('Unmounting %s on %s' % (dir_, remote))
|
|
|
|
remote.run(
|
|
|
|
args=[
|
2012-02-02 17:26:45 +00:00
|
|
|
'sync',
|
|
|
|
run.Raw('&&'),
|
|
|
|
'sudo',
|
|
|
|
'umount',
|
|
|
|
'-f',
|
2011-10-03 21:03:36 +00:00
|
|
|
dir_
|
|
|
|
]
|
|
|
|
)
|
|
|
|
|
2012-09-18 22:56:08 +00:00
|
|
|
if config.get('tmpfs_journal'):
|
|
|
|
log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
|
2012-09-18 23:31:39 +00:00
|
|
|
for remote, roles_for_host in osds.remotes.iteritems():
|
2012-09-18 22:56:08 +00:00
|
|
|
remote.run(
|
|
|
|
args=[ 'sudo', 'umount', '-f', '/mnt' ],
|
|
|
|
check_status=False,
|
|
|
|
)
|
2012-08-16 22:50:10 +00:00
|
|
|
|
2013-03-29 19:19:46 +00:00
|
|
|
if ctx.archive is not None and \
|
|
|
|
not (ctx.config.get('archive-on-error') and ctx.summary['success']):
|
2012-07-11 21:14:46 +00:00
|
|
|
# archive mon data, too
|
|
|
|
log.info('Archiving mon data...')
|
|
|
|
path = os.path.join(ctx.archive, 'data')
|
|
|
|
os.makedirs(path)
|
|
|
|
for remote, roles in mons.remotes.iteritems():
|
|
|
|
for role in roles:
|
|
|
|
if role.startswith('mon.'):
|
2013-02-17 06:32:16 +00:00
|
|
|
teuthology.pull_directory_tarball(
|
|
|
|
remote,
|
|
|
|
'/var/lib/ceph/mon',
|
|
|
|
path + '/' + role + '.tgz')
|
2012-07-11 21:14:46 +00:00
|
|
|
|
2013-02-17 07:44:03 +00:00
|
|
|
# and logs
|
|
|
|
log.info('Compressing logs...')
|
|
|
|
run.wait(
|
|
|
|
ctx.cluster.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
|
|
|
'find',
|
|
|
|
'/var/log/ceph',
|
|
|
|
'-name',
|
|
|
|
'*.log',
|
|
|
|
'-print0',
|
|
|
|
run.Raw('|'),
|
|
|
|
'sudo',
|
|
|
|
'xargs',
|
|
|
|
'-0',
|
|
|
|
'--no-run-if-empty',
|
|
|
|
'--',
|
|
|
|
'gzip',
|
|
|
|
'--',
|
|
|
|
],
|
|
|
|
wait=False,
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
log.info('Archiving logs...')
|
|
|
|
path = os.path.join(ctx.archive, 'remote')
|
|
|
|
os.makedirs(path)
|
|
|
|
for remote in ctx.cluster.remotes.iterkeys():
|
|
|
|
sub = os.path.join(path, remote.shortname)
|
|
|
|
os.makedirs(sub)
|
|
|
|
teuthology.pull_directory(remote, '/var/log/ceph',
|
|
|
|
os.path.join(sub, 'log'))
|
|
|
|
|
|
|
|
|
2011-06-16 17:36:15 +00:00
|
|
|
log.info('Cleaning ceph cluster...')
|
|
|
|
run.wait(
|
|
|
|
ctx.cluster.run(
|
|
|
|
args=[
|
2013-02-06 19:16:52 +00:00
|
|
|
'sudo',
|
2011-06-16 17:36:15 +00:00
|
|
|
'rm',
|
|
|
|
'-rf',
|
|
|
|
'--',
|
2013-02-06 19:16:52 +00:00
|
|
|
conf_path,
|
|
|
|
keyring_path,
|
2013-01-23 20:37:39 +00:00
|
|
|
'{tdir}/data'.format(tdir=testdir),
|
|
|
|
'{tdir}/monmap'.format(tdir=testdir),
|
2011-06-16 17:36:15 +00:00
|
|
|
],
|
|
|
|
wait=False,
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
2014-02-04 01:17:09 +00:00
|
|
|
def get_all_pg_info(rem_site, testdir):
    """
    Run ``ceph pg dump --format json`` on a remote and return its pg stats.

    :param rem_site: Remote on which the ceph command is run
    :param testdir: Test directory; ``<testdir>/archive/coverage`` is passed
                    to ceph-coverage as the coverage directory
    :return: The ``pg_stats`` entry of the decoded JSON dump
    """
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    dump_cmd = ['adjust-ulimits', 'ceph-coverage', coverage_dir]
    dump_cmd += ['ceph', 'pg', 'dump', '--format', 'json']
    proc = rem_site.run(args=dump_cmd, stdout=StringIO())
    # The command's captured stdout is the JSON document to decode.
    return json.loads(proc.stdout.getvalue())['pg_stats']
|
|
|
|
|
|
|
|
def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.

    :param ctx: Context
    :param config: Configuration (not read by this function)
    """
    # 12 polls, 10 seconds apart: the two minute budget from the docstring.
    retries = 12
    delays = 10
    vlist = ctx.cluster.remotes.values()
    testdir = teuthology.get_testdir(ctx)
    # Every ceph command below is issued from this one remote.
    rem_site = ctx.cluster.remotes.keys()[0]

    # Phase 1: wait until every pg reports exactly 'active+clean'.
    all_clean = False
    for _ in range(0, retries):
        stats = get_all_pg_info(rem_site, testdir)
        states = [stat['state'] for stat in stats]
        if len(set(states)) == 1 and states[0] == 'active+clean':
            all_clean = True
            break
        log.info("Waiting for all osds to be active and clean.")
        time.sleep(delays)
    if not all_clean:
        log.info("Scrubbing terminated -- not all pgs were active and clean.")
        return

    # Phase 2: remember "now" and ask every osd role to scrub.
    check_time_now = time.localtime()
    # NOTE(review): presumably this pause guarantees that any later scrub
    # stamp, which is compared at one-second resolution below, is strictly
    # newer than check_time_now -- confirm.
    time.sleep(1)
    for slists in vlist:
        for role in slists:
            if role.startswith('osd.'):
                log.info("Scrubbing osd {osd}".format(osd=role))
                rem_site.run(args=[
                    'adjust-ulimits',
                    'ceph-coverage',
                    '{tdir}/archive/coverage'.format(tdir=testdir),
                    'ceph', 'osd', 'scrub', role])

    # Phase 3: poll until every pg has a scrub stamp newer than
    # check_time_now, giving up after more than `retries` consecutive polls
    # in which the number of freshly scrubbed pgs did not grow.
    prev_good = 0
    gap_cnt = 0
    loop = True
    while loop:
        stats = get_all_pg_info(rem_site, testdir)
        timez = [stat['last_scrub_stamp'] for stat in stats]
        loop = False
        thiscnt = 0
        for tmval in timez:
            # Drop the fractional-seconds suffix.  partition() leaves the
            # stamp intact when there is no '.', whereas slicing up to
            # find('.') == -1 would cut off the last digit of the seconds.
            stamp = tmval.partition('.')[0]
            pgtm = time.strptime(stamp, '%Y-%m-%d %H:%M:%S')
            if pgtm > check_time_now:
                thiscnt += 1
            else:
                loop = True
        if thiscnt > prev_good:
            prev_good = thiscnt
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt > retries:
                log.info('Exiting scrub checking -- not all pgs scrubbed.')
                return
        if loop:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
|
2011-06-16 17:36:15 +00:00
|
|
|
|
2011-06-16 18:30:33 +00:00
|
|
|
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of a
    daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type
    """
    log.info('Starting %s daemons...' % type_)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_))

    # check whether any daemons of this type are configured
    if daemons is None:
        # A @contextlib.contextmanager generator must yield exactly once;
        # returning without yielding would make entering the context raise
        # RuntimeError("generator didn't yield").  Nothing was started, so
        # there is nothing to stop afterwards.
        yield
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    # Signal name handed to daemon-helper: 'term' when coverage or valgrind
    # is configured, 'kill' otherwise.
    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    num_active = 0
    for remote, roles_for_host in daemons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, type_):
            name = '%s.%s' % (type_, id_)

            # Ids ending in '-s' or containing '-s-' are not counted as
            # active.  NOTE(review): presumably these mark standby
            # daemons -- confirm.
            if not (id_.endswith('-s')) and (id_.find('-s-') == -1):
                num_active += 1

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
                ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '-i', id_]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.%s.prof' % (type_, id_)
                run_cmd.extend([ 'env', 'CPUPROFILE=%s' % profile_path ])

            if config.get('valgrind') is not None:
                # A per-daemon entry (e.g. 'osd.1') overrides a per-type
                # entry (e.g. 'osd').
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if name in config['valgrind']:
                    valgrind_args = config['valgrind'][name]
                run_cmd = teuthology.get_valgrind_args(testdir, name,
                                                       run_cmd,
                                                       valgrind_args)

            run_cmd.extend(run_cmd_tail)

            ctx.daemons.add_daemon(remote, type_, id_,
                                   args=run_cmd,
                                   logger=log.getChild(name),
                                   stdin=run.PIPE,
                                   wait=False,
                                   )

    if type_ == 'mds':
        # Set max_mds to the number of mds daemons counted as active above.
        firstmon = teuthology.get_first_mon(ctx, config)
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        mon0_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_)
|
2011-06-16 18:37:51 +00:00
|
|
|
|
2011-06-16 19:18:58 +00:00
|
|
|
def healthy(ctx, config):
    """
    Block until the cluster's osds are up and the cluster is healthy.

    Both waits are delegated to teuthology helpers
    (``wait_until_osds_up`` followed by ``wait_until_healthy``) and are run
    against the remote that hosts the first monitor.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    first_mon_role = teuthology.get_first_mon(ctx, config)
    mon_remotes = ctx.cluster.only(first_mon_role).remotes.keys()
    # Exactly one remote is expected; the unpacking fails otherwise.
    (mon0_remote,) = mon_remotes
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote)
    teuthology.wait_until_healthy(ctx, remote=mon0_remote)
|
|
|
|
|
2013-07-23 23:55:33 +00:00
|
|
|
def wait_for_osds_up(ctx, config):
    """
    Block until all osds are up.

    The wait is delegated to ``teuthology.wait_until_osds_up`` and is run
    against the remote that hosts the first monitor.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph osds are all up...')
    first_mon_role = teuthology.get_first_mon(ctx, config)
    mon_remotes = ctx.cluster.only(first_mon_role).remotes.keys()
    # Exactly one remote is expected; the unpacking fails otherwise.
    (mon0_remote,) = mon_remotes
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote)
|
|
|
|
|
2013-07-23 21:43:56 +00:00
|
|
|
def wait_for_mon_quorum(ctx, config):
    """
    Poll ``ceph quorum_status`` on the first monitor's remote until the
    reported quorum consists of exactly the given monitors.

    The status is polled once per second, with no upper bound on the
    number of attempts.

    :param ctx: Context
    :param config: Configuration; must be a list of the monitor names
                   expected in the quorum
    """
    assert isinstance(config, list)
    first_mon_role = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(first_mon_role).remotes.keys()

    expected = sorted(config)
    status_log = log.getChild('quorum_status')
    while True:
        proc = remote.run(
            args=['ceph', 'quorum_status'],
            stdout=StringIO(),
            logger=status_log,
        )
        status = json.loads(proc.stdout.getvalue())
        quorum = status.get('quorum_names', [])
        log.debug('Quorum: %s', quorum)
        # Order is irrelevant; only membership has to match.
        if sorted(quorum) == expected:
            return
        time.sleep(1)
|
|
|
|
|
2011-06-16 19:18:58 +00:00
|
|
|
|
2013-03-15 01:18:39 +00:00
|
|
|
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
      tasks:
      - ceph.restart: [all]

    For example::
      tasks:
      - ceph.restart: [osd.0, mon.1]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    ``all`` stands for every mon, osd, mds and rgw daemon known to
    ``ctx.daemons``; the same set is used when no daemon list is given.
    Each daemon is stopped and restarted once, in the order listed, even
    if it is named more than once.

    :param ctx: Context
    :param config: Configuration: None, a list of daemon names, or a dict
                   with the optional keys ``daemons``, ``wait-for-healthy``
                   (default true) and ``wait-for-osds-up`` (default false)
    """
    if config is None:
        config = {}
    if isinstance(config, list):
        config = {'daemons': config}

    def _all_daemon_names():
        # '<type>.<id>' for every daemon of the restartable role types.
        return [
            type_ + '.' + daemon.id_
            for type_ in ('mon', 'osd', 'mds', 'rgw')
            for daemon in ctx.daemons.iter_daemons_of_role(type_)
        ]

    if 'daemons' not in config:
        config['daemons'] = _all_daemon_names()
    assert isinstance(config['daemons'], list)

    # Expand the documented 'all' keyword (which has no '<type>.<id>' form
    # and used to fail with IndexError) and drop duplicates while keeping
    # the listed order.
    names = []
    for entry in config['daemons']:
        expanded = _all_daemon_names() if entry == 'all' else [entry]
        for name in expanded:
            if name not in names:
                names.append(name)

    for name in names:
        parts = name.split('.', 1)
        type_, id_ = parts[0], parts[1]
        daemon = ctx.daemons.get_daemon(type_, id_)
        daemon.stop()
        daemon.restart()

    if config.get('wait-for-healthy', True):
        healthy(ctx=ctx, config=None)
    if config.get('wait-for-osds-up', False):
        wait_for_osds_up(ctx=ctx, config=None)
    yield
|
2013-03-15 01:18:39 +00:00
|
|
|
|
2011-06-03 21:47:44 +00:00
|
|
|
@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:
        - interactive:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a tag::

        tasks:
        - ceph:
            tag: v0.42.13

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
            valgrind:
              mds.1: --tool=memcheck
              osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value
              client.0:
                debug client: 10
                debug ms: 1

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, e.g.::

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    Unless ``wait-for-healthy`` is set to false, the task waits for the
    cluster to become healthy before handing control to the nested tasks.
    The pgs are scrubbed when the task exits.

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    # Fold the job-wide 'ceph' overrides into this task's configuration.
    ceph_overrides = ctx.config.get('overrides', {}).get('ceph', {})
    teuthology.deep_merge(config, ceph_overrides)

    ctx.daemons = CephState()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        install_procs = ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', coverage_dir],
            wait=False,
        )
        run.wait(install_procs)

    def _cluster():
        # Built lazily, when contextutil.nested reaches this manager, from
        # the subset of options that cluster() is given.
        cluster_config = dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', None),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', [])),
        )
        return cluster(ctx=ctx, config=cluster_config)

    # Context manager factories, in the order they are entered.
    managers = [
        lambda: ceph_log(ctx=ctx, config=None),
        lambda: ship_utilities(ctx=ctx, config=None),
        lambda: valgrind_post(ctx=ctx, config=config),
        _cluster,
    ]
    for daemon_type in ('mon', 'osd', 'mds'):
        # Bind the loop variable as a default so each factory keeps its
        # own role type.
        managers.append(
            lambda type_=daemon_type: run_daemon(
                ctx=ctx, config=config, type_=type_)
        )

    with contextutil.nested(*managers):
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=None)
            yield
        finally:
            osd_scrub_pgs(ctx, config)
|