ceph/teuthology/run.py
Tommi Virtanen 99ac6b0b3e Disable asynchronous DNS lookups.
Especially on older hosts, we keep triggering errors::

  ServerNotFoundError: Unable to find the server at
  teuthology.front.sepia.ceph.com: [Errno 3] name does not exist

That comes from libevent's evdns via gevent.dns and httplib2. The rate
of these errors is low enough that they seem to be perhaps timeouts,
or more arbitrary. Busy looping on DNS resolution calls has never
triggered them, so far.

With ``monkey.patch_all(dns=False)``, the teuthology process will
block as a whole whenever doing DNS resolution. This will hopefully be
rare enough that it won't matter.

The only real "fix" seems to be upgrading libraries and hoping for the
best; this commit can be reverted after that is done.
2012-08-13 16:18:33 -07:00

274 lines
7.9 KiB
Python

import argparse
import os
import yaml
def config_file(string):
config = {}
try:
with file(string) as f:
g = yaml.safe_load_all(f)
for new in g:
config.update(new)
except IOError, e:
raise argparse.ArgumentTypeError(str(e))
return config
class MergeConfig(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
config = getattr(namespace, self.dest)
from teuthology.misc import deep_merge
for new in values:
deep_merge(config, new)
def parse_args():
parser = argparse.ArgumentParser(description='Run ceph integration tests')
parser.add_argument(
'-v', '--verbose',
action='store_true', default=None,
help='be more verbose',
)
parser.add_argument(
'config',
metavar='CONFFILE',
nargs='+',
type=config_file,
action=MergeConfig,
default={},
help='config file to read',
)
parser.add_argument(
'-a', '--archive',
metavar='DIR',
help='path to archive results in',
)
parser.add_argument(
'--description',
help='job description',
)
parser.add_argument(
'--owner',
help='job owner',
)
parser.add_argument(
'--lock',
action='store_true',
default=False,
help='lock machines for the duration of the run',
)
parser.add_argument(
'--block',
action='store_true',
default=False,
help='block until locking machines succeeds (use with --lock)',
)
args = parser.parse_args()
return args
def main():
from gevent import monkey; monkey.patch_all(dns=False)
from .orchestra import monkey; monkey.patch_all()
import logging
log = logging.getLogger(__name__)
ctx = parse_args()
loglevel = logging.INFO
if ctx.verbose:
loglevel = logging.DEBUG
logging.basicConfig(
level=loglevel,
)
if 'targets' in ctx.config and 'roles' in ctx.config:
targets = len(ctx.config['targets'])
roles = len(ctx.config['roles'])
assert targets >= roles, \
'%d targets are needed for all roles but found %d listed.' % (roles, targets)
if ctx.block:
assert ctx.lock, \
'the --block option is only supported with the --lock option'
from teuthology.misc import read_config
read_config(ctx)
log.debug('\n '.join(['Config:', ] + yaml.safe_dump(ctx.config, default_flow_style=False).splitlines()))
ctx.summary = dict(success=True)
if ctx.owner is None:
from teuthology.misc import get_user
ctx.owner = get_user()
ctx.summary['owner'] = ctx.owner
if ctx.description is not None:
ctx.summary['description'] = ctx.description
if ctx.archive is not None:
os.mkdir(ctx.archive)
handler = logging.FileHandler(
filename=os.path.join(ctx.archive, 'teuthology.log'),
)
formatter = logging.Formatter(
fmt='%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
datefmt='%Y-%m-%dT%H:%M:%S',
)
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)
with file(os.path.join(ctx.archive, 'pid'), 'w') as f:
f.write('%d' % os.getpid())
with file(os.path.join(ctx.archive, 'owner'), 'w') as f:
f.write(ctx.owner + '\n')
for task in ctx.config['tasks']:
assert 'kernel' not in task, \
'kernel installation shouldn be a base-level item, not part of the tasks list'
init_tasks = []
if ctx.lock:
assert 'targets' not in ctx.config, \
'You cannot specify targets in a config file when using the --lock option'
init_tasks.append({'internal.lock_machines': len(ctx.config['roles'])})
init_tasks.extend([
{'internal.save_config': None},
{'internal.check_lock': None},
{'internal.connect': None},
{'internal.check_conflict': None},
])
if 'kernel' in ctx.config:
init_tasks.append({'kernel': ctx.config['kernel']})
init_tasks.extend([
{'internal.base': None},
{'internal.archive': None},
{'internal.coredump': None},
{'internal.syslog': None},
{'internal.timer': None},
])
ctx.config['tasks'][:0] = init_tasks
from teuthology.run_tasks import run_tasks
try:
run_tasks(tasks=ctx.config['tasks'], ctx=ctx)
finally:
if not ctx.summary.get('success') and ctx.config.get('nuke-on-error'):
from teuthology.nuke import nuke
# only unlock if we locked them in the first place
nuke(ctx, log, ctx.lock)
if ctx.archive is not None:
with file(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f:
yaml.safe_dump(ctx.summary, f, default_flow_style=False)
if not ctx.summary.get('success', True):
import sys
sys.exit(1)
def schedule():
parser = argparse.ArgumentParser(description='Schedule ceph integration tests')
parser.add_argument(
'config',
metavar='CONFFILE',
nargs='*',
type=config_file,
action=MergeConfig,
default={},
help='config file to read',
)
parser.add_argument(
'--name',
help='name of suite run the job is part of',
)
parser.add_argument(
'--last-in-suite',
action='store_true',
default=False,
help='mark the last job in a suite so suite post-processing can be run',
)
parser.add_argument(
'--email',
help='where to send the results of a suite (only applies to the last job in a suite)',
)
parser.add_argument(
'--timeout',
help='how many seconds to wait for jobs to finish before emailing results (only applies to the last job in a suite',
type=int,
)
parser.add_argument(
'--description',
help='job description',
)
parser.add_argument(
'--owner',
help='job owner',
)
parser.add_argument(
'--delete',
metavar='JOBID',
type=int,
nargs='*',
help='list of jobs to remove from the queue',
)
parser.add_argument(
'-n', '--num',
default=1,
type=int,
help='number of times to run/queue the job'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
default=False,
help='be more verbose',
)
ctx = parser.parse_args()
if not ctx.last_in_suite:
assert not ctx.email, '--email is only applicable to the last job in a suite'
assert not ctx.timeout, '--timeout is only applicable to the last job in a suite'
from teuthology.misc import read_config, get_user
if ctx.owner is None:
ctx.owner = 'scheduled_{user}'.format(user=get_user())
read_config(ctx)
import teuthology.queue
beanstalk = teuthology.queue.connect(ctx)
beanstalk.use('teuthology')
if ctx.delete:
for jobid in ctx.delete:
job = beanstalk.peek(jobid)
if job is None:
print 'job {jid} is not in the queue'.format(jid=jobid)
else:
job.delete()
return
job_config = dict(
config=ctx.config,
name=ctx.name,
last_in_suite=ctx.last_in_suite,
email=ctx.email,
description=ctx.description,
owner=ctx.owner,
verbose=ctx.verbose,
)
if ctx.timeout is not None:
job_config['results_timeout'] = ctx.timeout
job = yaml.safe_dump(job_config)
num = ctx.num
while num > 0:
jid = beanstalk.put(job, ttr=60*60*24)
print 'Job scheduled with ID {jid}'.format(jid=jid)
num -= 1