# ceph/teuthology/suite.py

# this file is responsible for submitting tests into the queue
# by generating combinations of facets found in
# https://github.com/ceph/ceph-qa-suite.git
import argparse
import copy
import errno
import itertools
import logging
import os
import re
import subprocess
import sys
from textwrap import dedent, fill
import time
import yaml
from teuthology import misc
from teuthology import lock as lock
# Module-level logger, named after this module; used by every function below.
log = logging.getLogger(__name__)
def main():
    """
    Command-line entry point: schedule every job of the requested collections.

    Parses ``sys.argv``, expands each collection directory into its job
    matrix with build_matrix(), skips jobs excluded by architecture, OS
    type or machine type, and invokes ``teuthology-schedule`` once per
    remaining job.  If any job was generated (even one that was skipped
    afterwards), a final ``--last-in-suite`` invocation is made that
    carries the ``--email`` and ``--timeout`` options.  With ``--dry-run``
    the commands are only logged.

    Raises subprocess.CalledProcessError if a ``teuthology-schedule``
    invocation exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="""
Run a suite of ceph integration tests.
A suite is a set of collections.
A collection is a directory containing facets.
A facet is a directory containing config snippets.
Running a collection means running teuthology for every configuration
combination generated by taking one config snippet from each facet.
Any config files passed on the command line will be used for every
combination, and will override anything in the suite.
""")
    parser.add_argument(
        '-v', '--verbose',
        action='store_true', default=None,
        help='be more verbose',
    )
    parser.add_argument(
        '--dry-run',
        action='store_true', default=None,
        help='do a dry run; do not schedule anything',
    )
    parser.add_argument(
        '--name',
        help='name for this suite',
        required=True,
    )
    parser.add_argument(
        '--base',
        default=None,
        help='base directory for the collection(s)'
    )
    parser.add_argument(
        '--collections',
        metavar='DIR',
        nargs='+',
        required=True,
        help='the collections to run',
    )
    parser.add_argument(
        '--owner',
        help='job owner',
    )
    parser.add_argument(
        '--email',
        help='address to email test failures to',
    )
    parser.add_argument(
        '--timeout',
        help='how many seconds to wait for jobs to finish before emailing results',
    )
    parser.add_argument(
        '-n', '--num',
        default=1,
        type=int,
        help='number of times to run/queue each job'
    )
    parser.add_argument(
        '-w', '--worker',
        default='plana',
        help='which worker to use (type of machine)',
    )
    parser.add_argument(
        'config',
        metavar='CONFFILE',
        nargs='*',
        default=[],
        help='config file to read',
    )
    args = parser.parse_args()

    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=loglevel,
    )

    # Arguments shared by every teuthology-schedule invocation.
    base_arg = [
        os.path.join(os.path.dirname(sys.argv[0]), 'teuthology-schedule'),
        '--name', args.name,
        '--num', str(args.num),
        '--worker', args.worker,
    ]
    if args.verbose:
        base_arg.append('-v')
    if args.owner:
        base_arg.extend(['--owner', args.owner])

    # --base is optional; without it the collection names are used as
    # paths directly (os.path.join() cannot take None).
    collections = [
        (
            os.path.join(args.base, collection) if args.base else collection,
            collection,
        )
        for collection in args.collections
    ]

    # These depend only on the command-line config files, so look them up
    # once rather than once per collection.
    arch = get_arch(args.config)
    machine_type = get_machine_type(args.config)

    num_jobs = 0
    for collection, collection_name in sorted(collections):
        log.debug('Collection %s in %s' % (collection_name, collection))
        configs = [
            (combine_path(collection_name, item[0]), item[1])
            for item in build_matrix(collection)
        ]
        log.info('Collection %s in %s generated %d jobs' %
                 (collection_name, collection, len(configs)))
        num_jobs += len(configs)

        for description, config in configs:
            # Concatenate the facet snippets of this job, closing each
            # file as soon as it has been read.
            snippets = []
            for snippet_path in config:
                with open(snippet_path, 'r') as f:
                    snippets.append(f.read())
            # safe_load matches the rest of this module and refuses
            # arbitrary object construction; an empty document parses to
            # None, so fall back to an empty mapping.
            parsed_yaml = yaml.safe_load('\n'.join(snippets)) or {}
            os_type = parsed_yaml.get('os_type')
            exclude_arch = parsed_yaml.get('exclude_arch')
            exclude_os_type = parsed_yaml.get('exclude_os_type')

            if exclude_arch and exclude_arch == arch:
                log.info(
                    'Skipping due to excluded_arch: %s facets %s',
                    exclude_arch, description
                )
                continue
            if exclude_os_type and exclude_os_type == os_type:
                log.info(
                    'Skipping due to excluded_os_type: %s facets %s',
                    exclude_os_type, description
                )
                continue
            # We should not run multiple tests (changing distros) unless
            # the machine is a VPS.  Re-imaging baremetal is not yet
            # supported.
            if machine_type != 'vps' and os_type and os_type != 'ubuntu':
                log.info(
                    'Skipping due to non-ubuntu on baremetal facets %s',
                    description
                )
                continue

            log.info(
                'Scheduling %s', description
            )
            arg = list(base_arg)
            arg.extend([
                '--description', description,
                '--',
            ])
            arg.extend(args.config)
            arg.extend(config)
            _run_schedule(arg, args.dry_run)

    if num_jobs:
        # Final marker job; it carries the result-reporting options.
        arg = list(base_arg)
        arg.append('--last-in-suite')
        if args.email:
            arg.extend(['--email', args.email])
        if args.timeout:
            arg.extend(['--timeout', args.timeout])
        _run_schedule(arg, args.dry_run)


def _run_schedule(arg, dry_run):
    """Run the schedule command ``arg``, or only log it when ``dry_run``."""
    if dry_run:
        log.info('dry-run: %s' % ' '.join(arg))
    else:
        subprocess.check_call(
            args=arg,
        )
def combine_path(left, right):
    """
    Join ``left`` and ``right`` as path components.

    os.path.join(a, b) doesn't like it when b is None, so a false
    ``right`` simply yields ``left`` unchanged.
    """
    return os.path.join(left, right) if right else left
def build_matrix(path):
    """
    Return a list of items described by path.

    The input is just a path.  The output is an array of (description,
    [file list]) tuples.

    For a normal file we generate a new item for the result list; files
    whose name does not end in '.yaml' generate nothing.

    For a directory, we (recursively) generate a new item for each
    file/dir.

    For a directory with a magic '+' file, we generate a single item
    that concatenates all files/subdirs.

    For a directory with a magic '%' file, we generate a result set
    for each item in the directory, and then do a product to generate
    a result list with all combinations.  Entries that generate nothing
    are left out of the product.

    The final description (after recursion) for each item will look
    like a relative path.  If there was a % product, that path
    component will appear as a file with braces listing the selection
    of chosen subitems.

    A path that is neither a file nor a directory yields [].
    """
    if os.path.isfile(path):
        if path.endswith('.yaml'):
            return [(None, [path])]
        return []
    if not os.path.isdir(path):
        return []

    files = sorted(os.listdir(path))
    if '+' in files:
        # concatenate items: flatten the file lists of every sub-item into
        # one list, so the result keeps the documented
        # (description, [file list]) shape instead of a list of lists
        files.remove('+')
        merged = []
        for fn in files:
            for _, paths in build_matrix(os.path.join(path, fn)):
                merged.extend(paths)
        return [('+', merged)]

    if '%' in files:
        # convolve items
        files.remove('%')
        sublists = []
        for fn in files:
            raw = build_matrix(os.path.join(path, fn))
            # An empty sublist would make the whole product empty, so an
            # entry that contributes nothing is ignored.
            if raw:
                sublists.append(
                    [(combine_path(fn, item[0]), item[1]) for item in raw]
                )
        out = []
        # product() of zero iterables yields one empty tuple; guard so an
        # empty directory produces no items at all.
        if sublists:
            for combo in itertools.product(*sublists):
                name = '{' + ' '.join([item[0] for item in combo]) + '}'
                val = []
                for item in combo:
                    val.extend(item[1])
                out.append((name, val))
        return out

    # list items
    out = []
    for fn in files:
        raw = build_matrix(os.path.join(path, fn))
        out.extend([(combine_path(fn, item[0]), item[1]) for item in raw])
    return out
def ls():
    """
    Command-line entry point: print one status line per job.

    Parses ``sys.argv`` for ``--archive-dir`` and ``-v``.  For each job
    directory returned by get_jobs(), the documents of its summary.yaml
    are merged and printed as "job pass|FAIL owner description
    duration"; with ``-v`` the failure reason is printed on a following
    line.  A job without summary.yaml is described instead by its pid (if
    the recorded process is alive) and the last line of teuthology.log.

    Raises IOError if summary.yaml exists but cannot be read.
    """
    parser = argparse.ArgumentParser(description='List teuthology job results')
    parser.add_argument(
        '--archive-dir',
        metavar='DIR',
        help='path under which to archive results',
        required=True,
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true', default=False,
        help='show reasons tests failed',
    )
    args = parser.parse_args()

    for job in get_jobs(args.archive_dir):
        summary_file = os.path.join(args.archive_dir, job, 'summary.yaml')
        summary = {}
        try:
            with open(summary_file) as f:
                for doc in yaml.safe_load_all(f):
                    # an empty YAML document parses to None
                    if doc:
                        summary.update(doc)
        except IOError as e:
            if e.errno != errno.ENOENT:
                raise
            # No summary yet: the job is still running, or it died.
            print(_describe_unfinished_job(args.archive_dir, job))
            continue

        print("{job} {success} {owner} {desc} {duration}s".format(
            job=job,
            owner=summary.get('owner', '-'),
            desc=summary.get('description', '-'),
            success='pass' if summary.get('success', False) else 'FAIL',
            duration=int(summary.get('duration', 0)),
        ))
        if args.verbose and 'failure_reason' in summary:
            print(' {reason}'.format(reason=summary['failure_reason']))


def _describe_unfinished_job(archive_dir, job):
    """Return the status line for a job that has no summary.yaml."""
    job_dir = os.path.join(archive_dir, job)
    fields = ['%s ' % job]
    try:
        status = '(no process or summary.yaml)'
        pidfile = os.path.join(job_dir, 'pid')
        if os.path.isfile(pidfile):
            with open(pidfile, 'r') as f:
                # strip the trailing newline, otherwise the /proc lookup
                # below can never match
                pid = f.read().strip()
            if pid and os.path.isdir('/proc/%s' % pid):
                with open('/proc/%s/cmdline' % pid, 'r') as f:
                    cmdline = f.read()
                # Only trust the pid if that process mentions this archive
                # directory on its command line.
                if archive_dir in cmdline:
                    status = '(pid %s)' % pid
        fields.append(status)

        # Last line of the job log.  The argument list avoids passing the
        # path through a shell.
        tail = subprocess.Popen(
            ['tail', '-n', '1', os.path.join(job_dir, 'teuthology.log')],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        ).communicate()[0].rstrip()
        fields.append(tail)
    except (IOError, OSError):
        # Best effort: the process may exit, or files may vanish, between
        # the checks above.  Report whatever was gathered so far.
        pass
    return ' '.join(fields)
def generate_coverage(args):
    """
    Launch ``teuthology-coverage`` for the suite described by ``args``.

    Reads ``args.name``, ``args.archive_dir`` and the
    'coverage_output_dir', 'coverage_html_dir' and 'coverage_tools_dir'
    entries of ``args.teuthology_config``.  The child process is started
    with subprocess.Popen and is not waited for.
    """
    log.info('starting coverage generation')
    teuth_config = args.teuthology_config
    tool = os.path.join(os.path.dirname(sys.argv[0]), 'teuthology-coverage')
    output_dir = os.path.join(teuth_config['coverage_output_dir'], args.name)
    html_dir = os.path.join(teuth_config['coverage_html_dir'], args.name)
    command = [
        tool,
        '-v',
        '-o', output_dir,
        '--html-output', html_dir,
        '--cov-tools-dir', teuth_config['coverage_tools_dir'],
        args.archive_dir,
    ]
    subprocess.Popen(args=command)
def email_results(subject, from_, to, body):
    """
    Send ``body`` as a plain-text email through the SMTP server on localhost.

    :param subject: value of the Subject header
    :param from_:   sender address (From header and envelope sender)
    :param to:      recipient address (To header and envelope recipient)
    :param body:    message text
    :raises smtplib.SMTPException: if the message cannot be sent
    """
    log.info('Sending results to {to}: {body}'.format(to=to, body=body))
    import smtplib
    from email.mime.text import MIMEText
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_
    msg['To'] = to
    log.debug('sending email %s', msg.as_string())
    smtp = smtplib.SMTP('localhost')
    try:
        smtp.sendmail(msg['From'], [msg['To']], msg.as_string())
    finally:
        # Always end the session, even when sending failed, so the
        # connection is not left open.
        try:
            smtp.quit()
        except smtplib.SMTPServerDisconnected:
            # The server already dropped the connection; nothing to close,
            # and raising here would hide the original sendmail error.
            pass
def results():
    """
    Command-line entry point: wait for a suite to finish and report on it.

    Parses ``sys.argv``, configures logging (console plus a results.log
    file inside the archive directory), loads the teuthology config via
    misc.read_config() and hands over to _results().  Any exception from
    _results() is logged with its traceback and then re-raised.
    """
    parser = argparse.ArgumentParser(description='Email teuthology suite results')
    parser.add_argument(
        '--email',
        help='address to email test failures to',
    )
    parser.add_argument(
        '--timeout',
        type=int,
        default=0,
        help='how many seconds to wait for all tests to finish (default no wait)',
    )
    parser.add_argument(
        '--archive-dir',
        metavar='DIR',
        required=True,
        help='path under which results for the suite are stored',
    )
    parser.add_argument(
        '--name',
        required=True,
        help='name of the suite',
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true', default=False,
        help='be more verbose',
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
    )

    misc.read_config(args)

    # Mirror everything that gets logged into <archive-dir>/results.log.
    results_log = logging.FileHandler(
        filename=os.path.join(args.archive_dir, 'results.log'),
    )
    results_log.setFormatter(logging.Formatter(
        fmt='%(asctime)s.%(msecs)03d %(levelname)s:%(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S',
    ))
    logging.getLogger().addHandler(results_log)

    try:
        _results(args)
    except Exception:
        log.exception('error generating results')
        raise
def _results(args):
    """
    Wait for the suite's jobs to finish, then email results and start
    coverage generation.

    A job counts as finished once its directory under
    ``args.archive_dir`` contains summary.yaml.  Waiting only happens
    when ``args.timeout`` is positive and stops after that many seconds.
    The email is sent only if ``args.email`` is set; generate_coverage()
    runs in either case, even if sending the email raised.
    """
    def summary_path(job):
        return os.path.join(args.archive_dir, job, 'summary.yaml')

    # Non-hidden directories that do not have a summary yet.
    running_tests = [
        f for f in sorted(os.listdir(args.archive_dir))
        if not f.startswith('.')
        and os.path.isdir(os.path.join(args.archive_dir, f))
        and not os.path.exists(summary_path(f))
    ]
    starttime = time.time()
    log.info('Waiting up to %d seconds for tests to finish...', args.timeout)
    while running_tests and args.timeout > 0:
        if os.path.exists(summary_path(running_tests[-1])):
            running_tests.pop()
        elif time.time() - starttime > args.timeout:
            log.warning('test(s) did not finish before timeout of %d seconds',
                        args.timeout)
            break
        else:
            time.sleep(10)

    # Only claim that the tests finished when they actually did.
    if running_tests:
        log.info('%d test(s) still unfinished; gathering results...',
                 len(running_tests))
    else:
        log.info('Tests finished! gathering results...')

    (subject, body) = build_email_body(args.name, args.archive_dir,
                                       args.timeout)

    try:
        if args.email:
            email_results(
                subject=subject,
                from_=args.teuthology_config['results_sending_email'],
                to=args.email,
                body=body,
            )
    finally:
        generate_coverage(args)
def get_jobs(archive_dir):
    """
    Return the names of the job directories found in ``archive_dir``.

    A job directory is a sub-directory whose name consists only of
    digits.  Names are returned as strings, ordered by their numeric
    value (so '9' comes before '10'); names with equal value fall back to
    string order.

    Raises OSError if ``archive_dir`` cannot be listed.
    """
    jobs = [
        entry for entry in os.listdir(archive_dir)
        if re.match(r'\d+$', entry)
        and os.path.isdir(os.path.join(archive_dir, entry))
    ]
    return sorted(jobs, key=lambda job: (int(job), job))
# str.format() templates used by build_email_body() to assemble the
# results email.  The triple-quoted templates are dedented, so the
# indentation below is not part of the text.
email_templates = {
    # Whole message: header with counters, then the three job sections.
    'body_templ': dedent("""\
        Test Run: {name}
        =================================================================
        logs: {log_root}
        failed: {fail_count}
        hung: {hung_count}
        passed: {pass_count}
        {fail_sect}{hung_sect}{pass_sect}
        """),
    # One titled section (Failed / Hung / Passed) wrapping its job entries.
    'sect_templ': dedent("""\
        {title}
        =================================================================
        {jobs}
        """),
    # Entry for a failed job; log_line and sentry_line are either empty or
    # built from the two templates that follow.
    'fail_templ': dedent("""\
        [{job_id}] {desc}
        -----------------------------------------------------------------
        time: {time}s{log_line}{sentry_line}
        {reason}
        """),
    'fail_log_templ': "\nlog: {log}",
    'fail_sentry_templ': "\nsentry: {sentries}",
    # Entry for a job without summary.yaml.
    'hung_templ': dedent("""\
        [{job_id}] {desc}
        """),
    # Entry for a successful job.
    'pass_templ': dedent("""\
        [{job_id}] {desc}
        time: {time}s
        """),
}
def build_email_body(name, archive_dir, timeout):
    """
    Build the subject and body of the results email for a suite.

    Every job returned by get_jobs() lands in one of three groups: hung
    (no summary.yaml), passed (summary reports success) or failed
    (anything else, including a summary without a 'success' key).
    Entries appear in the order get_jobs() returns them.

    :param name:        suite name, used in the subject and body header
    :param archive_dir: directory holding one sub-directory per job
    :param timeout:     accepted for interface compatibility; not used
    :returns:           (subject, body) tuple of stripped strings
    """
    # Lists rather than dicts, so entries keep the order of get_jobs().
    failed = []
    hung = []
    passed = []

    for job in get_jobs(archive_dir):
        job_dir = os.path.join(archive_dir, job)
        summary_file = os.path.join(job_dir, 'summary.yaml')

        # Unfinished jobs will have no summary.yaml
        if not os.path.exists(summary_file):
            info_file = os.path.join(job_dir, 'info.yaml')
            desc = ''
            if os.path.exists(info_file):
                with open(info_file) as f:
                    # an empty file parses to None
                    info = yaml.safe_load(f) or {}
                desc = info.get('description', '')
            hung.append(email_templates['hung_templ'].format(
                job_id=job,
                desc=desc,
            ))
            continue

        with open(summary_file) as f:
            summary = yaml.safe_load(f) or {}

        if summary.get('success'):
            passed.append(email_templates['pass_templ'].format(
                job_id=job,
                desc=summary.get('description'),
                time=int(summary.get('duration', 0)),
            ))
            continue

        # Named log_url so the module-level logger `log` is not shadowed.
        log_url = misc.get_http_log_path(archive_dir, job)
        if log_url:
            log_line = email_templates['fail_log_templ'].format(log=log_url)
        else:
            log_line = ''

        sentry_events = summary.get('sentry_events')
        if sentry_events:
            sentry_line = email_templates['fail_sentry_templ'].format(
                sentries='\n '.join(sentry_events))
        else:
            sentry_line = ''

        # 'fill' is from the textwrap module and it collapses a given
        # string into multiple lines of a maximum width as specified. We
        # want 75 characters here so that when we indent by 4 on the next
        # line, we have 79-character exception paragraphs.
        # A failed job may carry no failure_reason at all; fill() cannot
        # take None.
        reason = fill(summary.get('failure_reason') or '', 75)
        reason = '\n'.join('    ' + line for line in reason.splitlines())

        failed.append(email_templates['fail_templ'].format(
            job_id=job,
            desc=summary.get('description'),
            time=int(summary.get('duration', 0)),
            reason=reason,
            log_line=log_line,
            sentry_line=sentry_line,
        ))

    def maybe_comma(more):
        # Separator after a subject part: a comma only if more parts follow.
        return ', ' if more else ' '

    subject = ''
    fail_sect = ''
    hung_sect = ''
    pass_sect = ''
    if failed:
        subject += '{num_failed} failed{sep}'.format(
            num_failed=len(failed),
            sep=maybe_comma(hung or passed)
        )
        fail_sect = email_templates['sect_templ'].format(
            title='Failed',
            jobs=''.join(failed)
        )
    if hung:
        subject += '{num_hung} hung{sep}'.format(
            num_hung=len(hung),
            sep=maybe_comma(passed),
        )
        hung_sect = email_templates['sect_templ'].format(
            title='Hung',
            jobs=''.join(hung),
        )
    if passed:
        subject += '%s passed ' % len(passed)
        pass_sect = email_templates['sect_templ'].format(
            title='Passed',
            jobs=''.join(passed),
        )

    body = email_templates['body_templ'].format(
        name=name,
        log_root=misc.get_http_log_path(archive_dir),
        fail_count=len(failed),
        hung_count=len(hung),
        pass_count=len(passed),
        fail_sect=fail_sect,
        hung_sect=hung_sect,
        pass_sect=pass_sect,
    )

    subject += 'in {suite}'.format(suite=name)
    return (subject.strip(), body.strip())
def get_arch(config):
    """
    Return the architecture of the machines targeted by ``config``.

    ``config`` is a list of YAML file paths.  For each file that sets
    'machine_type', the machines reported by lock.list_locks() are
    searched and the 'arch' of the first one of that type is returned.
    Returns None when no file names a machine type or no machine matches.
    """
    for yamlfile in config:
        with open(yamlfile) as f:
            # an empty file parses to None
            y = yaml.safe_load(f) or {}
        machine_type = y.get('machine_type')
        if machine_type:
            # list_locks() is handed an empty list in place of a context
            # object.
            fakectx = []
            for machine in lock.list_locks(fakectx):
                if machine['type'] == machine_type:
                    return machine['arch']
    return None
def get_os_type(configs):
    """
    Return the first truthy 'os_type' found in ``configs``, else None.

    Each element of ``configs`` is a sequence whose item at index 2 is
    the path of a YAML file; files are checked in order.
    """
    for config in configs:
        yamlfile = config[2]
        # the with-block closes the file once it has been parsed
        with open(yamlfile) as f:
            y = yaml.safe_load(f) or {}
        os_type = y.get('os_type')
        if os_type:
            return os_type
    return None
def get_exclude_arch(configs):
    """
    Return the first truthy 'exclude_arch' found in ``configs``, else None.

    Each element of ``configs`` is a sequence whose item at index 2 is
    the path of a YAML file; files are checked in order.
    """
    for config in configs:
        yamlfile = config[2]
        # the with-block closes the file once it has been parsed
        with open(yamlfile) as f:
            y = yaml.safe_load(f) or {}
        exclude_arch = y.get('exclude_arch')
        if exclude_arch:
            return exclude_arch
    return None
def get_exclude_os_type(configs):
    """
    Return the first truthy 'exclude_os_type' found in ``configs``, else
    None.

    Each element of ``configs`` is a sequence whose item at index 2 is
    the path of a YAML file; files are checked in order.
    """
    for config in configs:
        yamlfile = config[2]
        # the with-block closes the file once it has been parsed
        with open(yamlfile) as f:
            y = yaml.safe_load(f) or {}
        exclude_os_type = y.get('exclude_os_type')
        if exclude_os_type:
            return exclude_os_type
    return None
def get_machine_type(config):
    """
    Return the first truthy 'machine_type' found in ``config``, else None.

    ``config`` is a list of YAML file paths, checked in order.
    """
    for yamlfile in config:
        # the with-block closes the file once it has been parsed
        with open(yamlfile) as f:
            y = yaml.safe_load(f) or {}
        machine_type = y.get('machine_type')
        if machine_type:
            return machine_type
    return None