ceph/tasks/cephfs/vstart_runner.py
John Spray 62247f2009 tasks/cephfs: fix FuseMount bin path in vstart
FuseMount only uses the prefix for finding the 'ceph'
executable, which is in ./ for either cmake or
autotools, not ./src for cmake like other binaries.

Signed-off-by: John Spray <john.spray@redhat.com>
2015-10-02 17:41:18 +01:00

892 lines
29 KiB
Python

"""
Useful hack: override Filesystem and Mount interfaces to run a CephFSTestCase against a vstart
ceph instance instead of a packaged/installed cluster. Use this to turn around test cases
quickly during development.
For example, if you have teuthology, ceph-qa-suite and ceph all in ~git, then you would:
# Activate the teuthology virtualenv
source ~/git/teuthology/virtualenv/bin/activate
# Go into your ceph source tree
cd ~/git/ceph/src
# Start a vstart cluster
MDS=2 MON=1 OSD=3 ./vstart.sh -n
# Invoke a test using this script, with PYTHONPATH set appropriately
PYTHONPATH=~/git/teuthology/:~/git/ceph-qa-suite/ python ~/git/ceph-qa-suite/tasks/cephfs/vstart_runner.py
If you built out of tree with CMake, then switch to your build directory before executing vstart_runner.
"""
from StringIO import StringIO
from collections import defaultdict
import getpass
import signal
import tempfile
import threading
import datetime
import shutil
import re
import os
import time
import json
import sys
import errno
from unittest import suite
import unittest
from teuthology.orchestra.run import Raw, quote
import logging
# Module-wide logger; a file handler is attached so that records are written to
# ./vstart_runner.log, relative to the directory the runner is started from.
log = logging.getLogger(__name__)
handler = logging.FileHandler("./vstart_runner.log")
# Record format: timestamp with milliseconds, then level, logger name and message
formatter = logging.Formatter(
    fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S')
handler.setFormatter(formatter)
log.addHandler(handler)
log.setLevel(logging.INFO)
try:
from teuthology.exceptions import CommandFailedError
from tasks.ceph_manager import CephManager
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.filesystem import Filesystem
from teuthology.contextutil import MaxWhileTries
from teuthology.task import interactive
except ImportError:
sys.stderr.write("***\nError importing packages, have you activated your teuthology virtualenv "
"and set PYTHONPATH to point to teuthology and ceph-qa-suite?\n***\n\n")
raise
# Must import after teuthology because of gevent monkey patching
import subprocess
# Where to find most ceph binaries relative to the current directory:
# a CMakeCache.txt here means we are in the build dir of a cmake build and the
# binaries are under ./src/; otherwise we are assumed to be running in src/ of
# an autotools build, where they sit alongside us.
BIN_PREFIX = "./src/" if os.path.exists("./CMakeCache.txt") else "./"
class LocalRemoteProcess(object):
    """
    Wrap a local subprocess.Popen object so that it can be used where the
    tests expect a teuthology remote process: it exposes args, stdout,
    stderr, exitstatus/returncode, wait(), finished, kill() and stdin.
    """
    def __init__(self, args, subproc, check_status, stdout, stderr):
        """
        :param args: the command that was run (only used for messages/errors)
        :param subproc: the already-started subprocess.Popen instance
        :param check_status: if true, wait() raises CommandFailedError on a
                             non-zero exit status
        :param stdout: file-like object to collect stdout into, or None to
                       collect into a fresh StringIO
        :param stderr: as stdout, for stderr
        """
        self.args = args
        self.subproc = subproc
        self.stdout = StringIO() if stdout is None else stdout
        self.stderr = StringIO() if stderr is None else stderr
        self.check_status = check_status
        # Both stay None until the process has been reaped
        self.exitstatus = self.returncode = None

    def _reap(self):
        """
        Collect remaining output and the exit status from the subprocess.
        Returns the (out, err) pair that communicate() produced.
        """
        out, err = self.subproc.communicate()
        self.stdout.write(out)
        self.stderr.write(err)
        self.exitstatus = self.returncode = self.subproc.returncode
        return out, err

    def wait(self):
        """
        Block until the process exits.

        Raises CommandFailedError if check_status was requested and the
        process exited non-zero, regardless of whether the exit was first
        noticed here or by an earlier look at `finished`.
        """
        if self.finished:
            # Avoid calling communicate() on a dead process because it'll
            # give you stick about std* already being closed.  The exit status
            # was already recorded by `finished`, so honour check_status here
            # rather than silently swallowing the failure.
            if self.check_status and self.exitstatus != 0:
                raise CommandFailedError(self.args, self.exitstatus)
            return

        out, err = self._reap()

        if self.exitstatus != 0:
            # Echo the output of failed commands to make debugging easier
            sys.stderr.write(out)
            sys.stderr.write(err)

        if self.check_status and self.exitstatus != 0:
            raise CommandFailedError(self.args, self.exitstatus)

    @property
    def finished(self):
        """
        True once the process has exited.  The first time the exit is
        observed, the output and exit status are collected as a side effect.
        """
        if self.exitstatus is not None:
            return True

        if self.subproc.poll() is not None:
            self._reap()
            return True

        return False

    def kill(self):
        """
        SIGKILL the process if it is still running.
        """
        log.info("kill ")
        if self.subproc.pid and not self.finished:
            log.info("kill: killing pid {0} ({1})".format(
                self.subproc.pid, self.args))
            safe_kill(self.subproc.pid)
        else:
            log.info("kill: already terminated ({0})".format(self.args))

    @property
    def stdin(self):
        """
        Stand-in for a stdin pipe: closing it kills the process.
        """
        class FakeStdIn(object):
            def __init__(self, mount_daemon):
                self.mount_daemon = mount_daemon

            def close(self):
                self.mount_daemon.kill()

        return FakeStdIn(self)
class LocalRemote(object):
    """
    Amusingly named class to present the teuthology RemoteProcess interface when we are really
    running things locally for vstart

    Run this inside your src/ dir!
    """

    def __init__(self):
        self.name = "local"
        self.hostname = "localhost"
        self.user = getpass.getuser()

    def get_file(self, path, sudo, dest_dir):
        """
        Copy `path` to a freshly created temporary file and return the name
        of the copy.  `sudo` and `dest_dir` are accepted so that the signature
        matches what callers pass, but they are not used.
        """
        # Only the name is needed: close the handle straight away instead of
        # leaving it for the garbage collector.
        with tempfile.NamedTemporaryFile(delete=False) as placeholder:
            tmpfile = placeholder.name
        shutil.copy(path, tmpfile)
        return tmpfile

    def run(self, args, check_status=True, wait=True,
            stdout=None, stderr=None, cwd=None, stdin=None,
            logger=None, label=None):
        """
        Run a command locally and return a LocalRemoteProcess for it.

        :param args: list of strings, optionally containing run.Raw items
                     (in which case the command goes through the shell)
        :param check_status: raise on non-zero exit when waiting
        :param wait: if true, block until the command has exited
        :param stdout: file-like object for stdout, or None
        :param stderr: file-like object for stderr, or None
        :param cwd: working directory for the command
        :param stdin: optional string to write to the command's stdin
        :param logger: unused
        :param label: unused
        :raises RuntimeError: on a non-string argument (non-shell mode) or a
                              non-string stdin
        """
        log.info("run args={0}".format(args))

        # We don't need no stinkin' sudo
        args = [a for a in args if a != "sudo"]

        # We have to use shell=True if any run.Raw was present, e.g. &&
        shell = any(isinstance(a, Raw) for a in args)

        if shell:
            # Strip the teuthology wrapper commands, along with the argument
            # that follows 'ceph-coverage' and 'timeout'.
            filtered = []
            i = 0
            while i < len(args):
                if args[i] == 'adjust-ulimits':
                    i += 1
                elif args[i] == 'ceph-coverage':
                    i += 2
                elif args[i] == 'timeout':
                    i += 2
                else:
                    filtered.append(args[i])
                    i += 1

            args = quote(filtered)
            log.info("Running {0}".format(args))
        else:
            log.info("Running {0}".format(args))

            for arg in args:
                if not isinstance(arg, basestring):
                    raise RuntimeError("Oops, can't handle arg {0} type {1}".format(
                        arg, arg.__class__
                    ))

        subproc = subprocess.Popen(args,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   stdin=subprocess.PIPE,
                                   cwd=cwd,
                                   shell=shell)

        if stdin:
            if not isinstance(stdin, basestring):
                raise RuntimeError("Can't handle non-string stdins on a vstart cluster")

            # Hack: writing to stdin is not deadlock-safe, but it "always" works
            # as long as the input buffer is "small"
            subproc.stdin.write(stdin)

        proc = LocalRemoteProcess(
            args, subproc, check_status,
            stdout, stderr
        )

        if wait:
            proc.wait()

        return proc
# FIXME: twiddling vstart daemons is likely to be unreliable, we should probably just let vstart
# run RADOS and run the MDS daemons directly from the test runner
class LocalDaemon(object):
    """
    Handle on a ceph daemon of a vstart cluster, located by scanning the
    output of `ps` for its command line.
    """
    def __init__(self, daemon_type, daemon_id):
        self.daemon_type = daemon_type
        self.daemon_id = daemon_id
        self.controller = LocalRemote()

    @property
    def remote(self):
        return LocalRemote()

    def running(self):
        """
        Return True if a process for this daemon shows up in `ps`.
        """
        return self._get_pid() is not None

    def _get_pid(self):
        """
        Return PID as an integer or None if not found
        """
        ps_txt = self.controller.run(
            args=["ps", "aux"]
        ).stdout.getvalue().strip()
        # First line of ps output is the column header
        lines = ps_txt.split("\n")[1:]

        wanted = "ceph-{0} -i {1}".format(self.daemon_type, self.daemon_id)
        for line in lines:
            if wanted in line:
                log.info("Found ps line for daemon: {0}".format(line))
                return int(line.split()[1])

        return None

    def wait(self, timeout):
        """
        Poll once a second until the daemon process is gone.

        :raises MaxWhileTries: if it is still there after more than
                               `timeout` polls
        """
        waited = 0
        while self._get_pid() is not None:
            if waited > timeout:
                raise MaxWhileTries("Timed out waiting for daemon {0}.{1}".format(self.daemon_type, self.daemon_id))
            time.sleep(1)
            waited += 1

    def stop(self, timeout=300):
        """
        SIGKILL the daemon and wait for it to disappear.
        """
        # Look the PID up exactly once: checking running() and then fetching
        # the PID separately leaves a window in which the daemon can exit and
        # we would end up trying to kill PID None.
        pid = self._get_pid()
        if pid is None:
            log.error('tried to stop a non-running daemon')
            return

        log.info("Killing PID {0} for {1}.{2}".format(pid, self.daemon_type, self.daemon_id))
        # safe_kill tolerates the process exiting between the ps and the kill
        safe_kill(pid)
        self.wait(timeout=timeout)

    def restart(self):
        """
        Stop the daemon if it is running, then start it again from BIN_PREFIX.
        """
        if self._get_pid() is not None:
            self.stop()

        self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id])
def safe_kill(pid):
    """
    SIGKILL `pid`, quietly doing nothing if no such process exists any more.

    os.kill raises OSError for a process that is already dead; that one
    case (ESRCH) is ignored here, any other OSError is passed on.
    """
    try:
        outcome = os.kill(pid, signal.SIGKILL)
    except OSError as err:
        if err.errno != errno.ESRCH:
            raise
        # Raced with process termination
        return None
    else:
        return outcome
class MountDaemon(object):
    """
    Stand-in for the fuse_daemon attribute of FuseMount: tracks a ceph-fuse
    process by looking it up in `ps` using its client name.
    """
    def __init__(self, client_id):
        self.controller = LocalRemote()
        self.client_id = client_id

    def _get_pid(self):
        """
        Look up the ceph-fuse process for our client.

        Returns its PID as an integer, or None when there is no such process.
        """
        listing = self.controller.run(
            args=["ps", "ua", "-C", "ceph-fuse"],
            check_status=False  # ps returns err if nothing running so ignore
        )
        needle = "--name client.{0} ".format(self.client_id)

        # The first row of the ps output is the column header
        for row in listing.stdout.getvalue().strip().split("\n")[1:]:
            if needle in row:
                return int(row.split()[1])

        return None

    def poll(self):
        """True when no ceph-fuse process for this client is running."""
        pid = self._get_pid()
        return pid is None

    @property
    def finished(self):
        """True when no ceph-fuse process for this client is running."""
        pid = self._get_pid()
        return pid is None

    def wait(self):
        """Block, checking once a second, until the process is gone."""
        while True:
            if self._get_pid() is None:
                break
            time.sleep(1)

    def kill(self):
        """SIGKILL the ceph-fuse process, if there is one."""
        target = self._get_pid()
        if target is not None:
            safe_kill(target)

    @property
    def stdin(self):
        """Object whose close() kills the daemon."""
        class FakeStdIn(object):
            def __init__(self, mount_daemon):
                self.mount_daemon = mount_daemon

            def close(self):
                self.mount_daemon.kill()

        return FakeStdIn(self)
class LocalFuseMount(FuseMount):
    """
    FuseMount specialization that runs ceph-fuse from the local build tree
    through a LocalRemote.
    """
    def __init__(self, client_id, mount_point):
        # The parent is handed a dummy test dir; the mountpoint is then set
        # explicitly from the caller's mount_point.
        test_dir = "/tmp/not_there"
        super(LocalFuseMount, self).__init__(None, test_dir, client_id, LocalRemote())
        self.mountpoint = mount_point

    def run_shell(self, args, wait=True):
        """
        Run a command with the mountpoint as its working directory and
        return the resulting process object.
        """
        # FIXME maybe should add a pwd arg to teuthology.orchestra so that
        # the "cd foo && bar" shenanigans isn't needed to begin with and
        # then we wouldn't have to special case this
        return self.client_remote.run(
            args, wait=wait, cwd=self.mountpoint
        )

    @property
    def _prefix(self):
        # FuseMount only uses the prefix for running ceph, which in cmake or autotools is in
        # the present path
        return "./"

    def _asok_path(self):
        """
        Return the path of this client's admin socket under ./out.
        """
        # In teuthology, the asok is named after the PID of the ceph-fuse process, because it's
        # run foreground.  When running it daemonized however, the asok is named after
        # the PID of the launching process, not the long running ceph-fuse process.  Therefore
        # we need to give an exact path here as the logic for checking /proc/ for which
        # asok is alive does not work.
        path = "./out/client.{0}.{1}.asok".format(self.client_id, self._proc.subproc.pid)
        log.info("I think my launching pid was {0}".format(self._proc.subproc.pid))
        return path

    def umount(self):
        # Only hand over to the parent when there is actually something mounted
        if self.is_mounted():
            super(LocalFuseMount, self).umount()

    def mount(self):
        """
        Create the mountpoint, start ceph-fuse on it, and wait for the new
        fuse connection to show up in /sys/fs/fuse/connections.

        Raises RuntimeError if no new connection appears in time, or if the
        set of new connections is not exactly one entry.
        """
        self.client_remote.run(
            args=[
                'mkdir',
                '--',
                self.mountpoint,
            ],
        )

        def list_connections():
            # Returns the entries of /sys/fs/fuse/connections as a list of
            # ints; an ls failure is treated as "no connections".
            # The fusectl mount is attempted every time and its failure ignored.
            self.client_remote.run(
                args=["mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
                check_status=False
            )
            p = self.client_remote.run(
                args=["ls", "/sys/fs/fuse/connections"],
                check_status=False
            )
            if p.exitstatus != 0:
                log.warn("ls conns failed with {0}, assuming none".format(p.exitstatus))
                return []

            ls_str = p.stdout.getvalue().strip()
            if ls_str:
                return [int(n) for n in ls_str.split("\n")]
            else:
                return []

        # Before starting ceph-fuse process, note the contents of
        # /sys/fs/fuse/connections
        pre_mount_conns = list_connections()
        log.info("Pre-mount connections: {0}".format(pre_mount_conns))

        prefix = [os.path.join(BIN_PREFIX, "ceph-fuse")]
        # When not running as root, pass an extra option to ceph-fuse
        if os.getuid() != 0:
            prefix += ["--client-die-on-failed-remount=false"]

        self._proc = self.client_remote.run(args=
                                            prefix + [
                                                "--name",
                                                "client.{0}".format(self.client_id),
                                                self.mountpoint
                                            ])

        log.info("Mounted client.{0} with pid {1}".format(self.client_id, self._proc.subproc.pid))

        self.fuse_daemon = MountDaemon(self.client_id)

        # Wait for the connection reference to appear in /sys
        waited = 0
        post_mount_conns = list_connections()
        while len(post_mount_conns) <= len(pre_mount_conns):
            time.sleep(1)
            waited += 1
            if waited > 30:
                raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
                    waited
                ))
            post_mount_conns = list_connections()

        log.info("Post-mount connections: {0}".format(post_mount_conns))

        # Record our fuse connection number so that we can use it when
        # forcing an unmount
        new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
        if len(new_conns) == 0:
            raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
        elif len(new_conns) > 1:
            raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
        else:
            self._fuse_conn = new_conns[0]

    def _run_python(self, pyscript):
        """
        Override this to remove the daemon-helper prefix that is used otherwise
        to make the process killable.
        """
        # wait=False: the process object is returned without waiting for exit
        return self.client_remote.run(args=[
            'python', '-c', pyscript
        ], wait=False)
class LocalCephManager(CephManager):
    """
    CephManager that issues cluster commands through the ./ceph binary of a
    local vstart cluster.
    """
    def __init__(self):
        # Deliberately skip parent init, only inheriting from it to get
        # util methods like osd_dump that sit on top of raw_cluster_cmd
        self.controller = LocalRemote()

        # A minority of CephManager fns actually bother locking for when
        # certain teuthology tests want to run tasks in parallel
        self.lock = threading.RLock()

    def find_remote(self, daemon_type, daemon_id):
        """
        daemon_type like 'mds', 'osd'
        daemon_id like 'a', '0'
        """
        return LocalRemote()

    def raw_cluster_cmd(self, *args):
        """
        args like ["osd", "dump"]
        return stdout string
        """
        proc = self.controller.run(["./ceph"] + list(args))
        return proc.stdout.getvalue()

    def raw_cluster_cmd_result(self, *args):
        """
        like raw_cluster_cmd but don't check status, just return rc
        """
        proc = self.controller.run(["./ceph"] + list(args), check_status=False)
        return proc.exitstatus

    def admin_socket(self, daemon_type, daemon_id, command, check_status=True):
        """
        Run `./ceph daemon <type>.<id> <command...>` and return the process.
        `command` is a list of arguments.
        """
        return self.controller.run(
            args=["./ceph", "daemon", "{0}.{1}".format(daemon_type, daemon_id)] + command, check_status=check_status
        )

    def get_mds_status(self, mds):
        """
        Run cluster commands for the mds in order to get mds information.

        Returns the first entry of the MDS map's 'info' section whose name
        is `mds`, or None if there is no such entry.
        """
        for info in self.get_mds_status_all()['info'].values():
            if info['name'] == mds:
                return info

        return None

    def get_mds_status_by_rank(self, rank):
        """
        Run cluster commands for the mds in order to get mds information
        check rank.

        Returns the first entry of the MDS map's 'info' section whose rank
        is `rank`, or None if there is no such entry.
        """
        for info in self.get_mds_status_all()['info'].values():
            if info['rank'] == rank:
                return info

        return None

    def get_mds_status_all(self):
        """
        Run cluster command to extract all the mds status.
        """
        out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
        # The first line of the output is skipped; the rest is parsed as JSON
        j = json.loads(' '.join(out.splitlines()[1:]))
        return j
class LocalFilesystem(Filesystem):
    """
    Filesystem specialization for a vstart cluster: MDS names are read from
    ./ceph.conf and configuration changes are appended to that same file.
    """
    def __init__(self, ctx):
        # Deliberately skip calling parent constructor
        self._ctx = ctx

        self.admin_remote = LocalRemote()

        # Hack: cheeky inspection of ceph.conf to see what MDSs exist
        self.mds_ids = set()
        with open("ceph.conf") as conf_file:
            for line in conf_file:
                match = re.match(r"^\[mds\.(.+)\]$", line)
                if match:
                    self.mds_ids.add(match.group(1))

        if not self.mds_ids:
            raise RuntimeError("No MDSs found in ceph.conf!")

        self.mds_ids = list(self.mds_ids)

        log.info("Discovered MDS IDs: {0}".format(self.mds_ids))

        self.mon_manager = LocalCephManager()

        self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])

        self.client_remote = LocalRemote()

        # subsystem name -> {key: value}, as set through set_ceph_conf
        self._conf = defaultdict(dict)

    @property
    def _prefix(self):
        return BIN_PREFIX

    def set_clients_block(self, blocked, mds_id=None):
        raise NotImplementedError()

    def get_pgs_per_fs_pool(self):
        # FIXME: assuming there are 3 OSDs
        return 3 * int(self.get_config('mon_pg_warn_min_per_osd'))

    def get_config(self, key, service_type=None):
        """
        Fetch the value of config option `key` over the admin socket of a
        daemon of the given type ('mon' when not specified).
        """
        if service_type is None:
            service_type = 'mon'

        # FIXME hardcoded vstart service IDs
        service_id = {
            'mon': 'a',
            'mds': 'a',
            'osd': '0'
        }[service_type]

        return self.json_asok(['config', 'get', key], service_type, service_id)[key]

    def _write_conf(self):
        """
        Rewrite ./ceph.conf so that it ends with a section, introduced by a
        #LOCAL_TEST banner, holding every setting currently in self._conf.
        """
        # In teuthology, we have the honour of writing the entire ceph.conf, but
        # in vstart land it has mostly already been written and we need to carefully
        # append to it.
        conf_path = "./ceph.conf"
        banner = "\n#LOCAL_TEST\n"
        with open(conf_path) as conf_file:
            existing_str = conf_file.read()

        # Throw away whatever a previous call appended
        banner_at = existing_str.find(banner)
        if banner_at != -1:
            existing_str = existing_str[:banner_at]

        # Build the new tail separately so that commenting out a key in the
        # pre-existing text can never touch a line we have just written (the
        # same key may legitimately be set for more than one subsystem).
        appended = banner

        for subsys, kvs in self._conf.items():
            appended += "\n[{0}]\n".format(subsys)
            for key, val in kvs.items():
                # comment out any existing instances; ones that already carry a
                # '#' are left alone so repeated rewrites do not pile up '#'s
                existing_str = re.sub(
                    r"(?<!#)" + re.escape(key),
                    lambda found: "#" + found.group(0),
                    existing_str)
                appended += "{0} = {1}\n".format(key, val)

        with open(conf_path, "w") as conf_file:
            conf_file.write(existing_str + appended)

    def set_ceph_conf(self, subsys, key, value):
        """Set `key` to `value` in section `subsys` and rewrite ceph.conf."""
        self._conf[subsys][key] = value
        self._write_conf()

    def clear_ceph_conf(self, subsys, key):
        """
        Forget a setting made with set_ceph_conf and rewrite ceph.conf.
        Raises KeyError if the setting was never made.
        """
        del self._conf[subsys][key]
        self._write_conf()

    def clear_firewall(self):
        # FIXME: unimplemented
        pass
class InteractiveFailureResult(unittest.TextTestResult):
    """
    Test result that, on top of the normal text reporting, logs every
    failure or error and then drops into the teuthology interactive task.
    """
    def _go_interactive(self, template, test, err):
        # Shared tail of addFailure/addError: log the traceback, log the
        # one-line notice built from `template`, then go interactive.
        log.error(self._exc_info_to_string(err, test))
        log.error(template.format(
            self.getDescription(test)
        ))
        interactive.task(ctx=None, config=None)

    def addFailure(self, test, err):
        super(InteractiveFailureResult, self).addFailure(test, err)
        self._go_interactive("Failure in test '{0}', going interactive", test, err)

    def addError(self, test, err):
        super(InteractiveFailureResult, self).addError(test, err)
        self._go_interactive("Error in test '{0}', going interactive", test, err)
def exec_test():
    """
    Entry point: run CephFS test cases against the vstart cluster in the
    current directory.

    Command line arguments (read from sys.argv):
      --interactive   go interactive when a test fails or errors
      <name>...       test modules/classes/methods to load; with none given,
                      tests are discovered next to this file

    Always finishes by calling sys.exit(): 0 when the run was successful,
    -1 otherwise (including missing binaries and unknown options).
    """
    # Help developers by stopping up-front if their tree isn't built enough for all the
    # tools that the tests might want to use (add more here if needed)
    require_binaries = ["ceph-dencoder", "cephfs-journal-tool", "cephfs-data-scan",
                        "cephfs-table-tool", "ceph-fuse", "rados"]
    missing_binaries = [b for b in require_binaries if not os.path.exists(os.path.join(BIN_PREFIX, b))]
    if missing_binaries:
        log.error("Some ceph binaries missing, please build them: {0}".format(" ".join(missing_binaries)))
        sys.exit(-1)

    test_dir = tempfile.mkdtemp()

    # Run with two clients because some tests require the second one
    clients = ["0", "1"]

    remote = LocalRemote()

    # Tolerate no MDSs or clients running at start
    ps_txt = remote.run(
        args=["ps", "aux"]
    ).stdout.getvalue().strip()
    lines = ps_txt.split("\n")[1:]

    for line in lines:
        if 'ceph-fuse' in line or 'ceph-mds' in line:
            pid = int(line.split()[1])
            log.warn("Killing stray process {0}".format(line))
            # The process may exit on its own between the ps and the kill
            safe_kill(pid)

    class LocalCluster(object):
        # Minimal stand-in for the cluster object of a teuthology context:
        # a single remote carrying a single role.
        def __init__(self, rolename="placeholder"):
            self.remotes = {
                remote: [rolename]
            }

        def only(self, requested):
            return self.__class__(rolename=requested)

    class LocalContext(object):
        # Minimal stand-in for the teuthology context object
        def __init__(self):
            self.config = {}
            self.teuthology_config = {
                'test_path': test_dir
            }
            self.cluster = LocalCluster()

        def __del__(self):
            # Clean up the temporary test directory along with the context
            shutil.rmtree(self.teuthology_config['test_path'])

    ctx = LocalContext()

    mounts = []
    for client_id in clients:
        # Populate client keyring (it sucks to use client.admin for test clients
        # because it's awkward to find the logs later)
        client_name = "client.{0}".format(client_id)

        with open("./keyring") as keyring:
            have_key = client_name in keyring.read()

        if not have_key:
            p = remote.run(args=["./ceph", "auth", "get-or-create", client_name,
                                 "osd", "allow rw",
                                 "mds", "allow",
                                 "mon", "allow r"])

            with open("./keyring", "a") as keyring:
                keyring.write(p.stdout.getvalue())

        mount_point = os.path.join(test_dir, "mnt.{0}".format(client_id))
        mount = LocalFuseMount(client_id, mount_point)
        mounts.append(mount)
        if mount.is_mounted():
            log.warn("unmounting {0}".format(mount_point))
            mount.umount_wait()
        else:
            if os.path.exists(mount_point):
                os.rmdir(mount_point)
    filesystem = LocalFilesystem(ctx)

    from tasks.cephfs_test_runner import DecoratingLoader

    class LogStream(object):
        # File-like object handed to the test runner: complete lines are
        # forwarded to the log, a trailing partial line is kept until the
        # rest of it arrives.
        def __init__(self):
            self.buffer = ""

        def write(self, data):
            self.buffer += data
            if "\n" in self.buffer:
                lines = self.buffer.split("\n")
                for line in lines[:-1]:
                    log.info(line)
                self.buffer = lines[-1]

        def flush(self):
            pass

    decorating_loader = DecoratingLoader({
        "ctx": ctx,
        "mounts": mounts,
        "fs": filesystem
    })

    # For the benefit of polling tests like test_full -- in teuthology land we set this
    # in a .yaml, here it's just a hardcoded thing for the developer's pleasure.
    remote.run(args=["./ceph", "tell", "osd.*", "injectargs", "--osd-mon-report-interval-max", "5"])
    filesystem.set_ceph_conf("osd", "osd_mon_report_interval_max", "5")

    # Vstart defaults to two segments, which very easily gets a "behind on trimming" health warning
    # from normal IO latency.  Increase it for running tests.
    filesystem.set_ceph_conf("mds", "mds log max segments", "10")

    # Make sure the filesystem created in tests has uid/gid that will let us talk to
    # it after mounting it (without having to go root).  Set in 'global' not just 'mds'
    # so that cephfs-data-scan will pick it up too.
    filesystem.set_ceph_conf("global", "mds root ino uid", "%s" % os.getuid())
    filesystem.set_ceph_conf("global", "mds root ino gid", "%s" % os.getgid())

    # Monkeypatch get_package_version to avoid having to work out what kind of distro we're on
    def _get_package_version(remote, pkg_name):
        # Used in cephfs tests to find fuse version.  Your development workstation *does* have >=2.9, right?
        return "2.9"

    import teuthology.packaging
    teuthology.packaging.get_package_version = _get_package_version

    def enumerate_methods(s):
        # Walk a (possibly nested) suite, yielding (containing suite, test)
        # for every individual test.
        for t in s._tests:
            if isinstance(t, suite.BaseTestSuite):
                for sub in enumerate_methods(t):
                    yield sub
            else:
                yield s, t

    interactive_on_error = False

    args = sys.argv[1:]
    flags = [a for a in args if a.startswith("-")]
    modules = [a for a in args if not a.startswith("-")]
    for f in flags:
        if f == "--interactive":
            interactive_on_error = True
        else:
            log.error("Unknown option '{0}'".format(f))
            sys.exit(-1)

    if modules:
        log.info("Executing modules: {0}".format(modules))
        module_suites = []
        for mod_name in modules:
            # Test names like cephfs.test_auto_repair
            log.info("Loaded: {0}".format(list(module_suites)))
            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
        overall_suite = suite.TestSuite(module_suites)
    else:
        log.info("Excuting all tests")
        overall_suite = decorating_loader.discover(
            os.path.dirname(os.path.abspath(__file__))
        )

    # Filter out tests that don't lend themselves to interactive running,
    victims = []
    for case, method in enumerate_methods(overall_suite):
        fn = getattr(method, method._testMethodName)

        drop_test = False

        if hasattr(fn, 'is_long_running') and getattr(fn, 'is_long_running') is True:
            drop_test = True
            log.warn("Dropping test because long running: {0}".format(method.id()))

        # Tests that need client trimming are only dropped when we are not
        # root.  This must not undo a decision to drop the test made above,
        # and the warning is only logged when the test really is dropped.
        if getattr(fn, "needs_trimming", False) is True and os.getuid() != 0:
            drop_test = True
            log.warn("Dropping test because client trim unavailable: {0}".format(method.id()))

        if drop_test:
            # Don't drop the test if it was explicitly requested in arguments
            is_named = any(named.endswith(method.id()) for named in modules)

            if not is_named:
                victims.append((case, method))

    log.info("Disabling {0} tests because of is_long_running or needs_trimming".format(len(victims)))
    for s, method in victims:
        s._tests.remove(method)

    if interactive_on_error:
        result_class = InteractiveFailureResult
    else:
        result_class = unittest.TextTestResult
    fail_on_skip = False

    class LoggingResult(result_class):
        # Adds start/stop log lines (with duration) around every test
        def startTest(self, test):
            log.info("Starting test: {0}".format(self.getDescription(test)))
            test.started_at = datetime.datetime.utcnow()
            return super(LoggingResult, self).startTest(test)

        def stopTest(self, test):
            log.info("Stopped test: {0} in {1}s".format(
                self.getDescription(test),
                (datetime.datetime.utcnow() - test.started_at).total_seconds()
            ))
            # Let the base class do its own end-of-test bookkeeping too
            return super(LoggingResult, self).stopTest(test)

        def addSkip(self, test, reason):
            if fail_on_skip:
                # Don't just call addFailure because that requires a traceback
                self.failures.append((test, reason))
            else:
                super(LoggingResult, self).addSkip(test, reason)

    # Execute!
    result = unittest.TextTestRunner(
        stream=LogStream(),
        resultclass=LoggingResult,
        verbosity=2,
        failfast=True).run(overall_suite)

    if not result.wasSuccessful():
        result.printErrors()  # duplicate output at end for convenience
        sys.exit(-1)
    else:
        sys.exit(0)
# Run the tests only when this file is executed as a script, not on import
if __name__ == "__main__":
    exec_test()