# ceph/teuthology/task/ceph-deploy.py
from cStringIO import StringIO
import contextlib
import os
import time
import logging
from teuthology import misc as teuthology
from teuthology import contextutil
from ..config import config as teuth_config
import ceph as ceph_fn
from ..orchestra import run
log = logging.getLogger(__name__)


@contextlib.contextmanager
def download_ceph_deploy(ctx, config):
"""
Downloads ceph-deploy from the ceph.com git mirror and (by default)
switches to the master branch. If the `ceph-deploy-branch` is specified, it
will use that instead.
"""
log.info('Downloading ceph-deploy...')
testdir = teuthology.get_testdir(ctx)
ceph_admin = teuthology.get_first_mon(ctx, config)
default_cd_branch = {'ceph-deploy-branch': 'master'}
ceph_deploy_branch = config.get(
'ceph-deploy',
default_cd_branch).get('ceph-deploy-branch')
ctx.cluster.only(ceph_admin).run(
args=[
'git', 'clone', '-b', ceph_deploy_branch,
teuth_config.ceph_git_base_url + 'ceph-deploy.git',
'{tdir}/ceph-deploy'.format(tdir=testdir),
],
)
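    # ceph-deploy's bootstrap script sets up a virtualenv and pulls in its
    # dependencies inside the checkout.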
ctx.cluster.only(ceph_admin).run(
args=[
'cd',
'{tdir}/ceph-deploy'.format(tdir=testdir),
run.Raw('&&'),
'./bootstrap',
],
)
try:
yield
finally:
log.info('Removing ceph-deploy ...')
ctx.cluster.only(ceph_admin).run(
args=[
'rm',
'-rf',
'{tdir}/ceph-deploy'.format(tdir=testdir),
],
)


def is_healthy(ctx, config):
"""Wait until a Ceph cluster is healthy."""
testdir = teuthology.get_testdir(ctx)
ceph_admin = teuthology.get_first_mon(ctx, config)
(remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
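    # No explicit timeout: poll 'ceph health' once a second until the
    # cluster reports HEALTH_OK.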
while True:
r = remote.run(
args=[
'cd',
'{tdir}'.format(tdir=testdir),
run.Raw('&&'),
'sudo', 'ceph',
'health',
],
stdout=StringIO(),
logger=log.getChild('health'),
)
out = r.stdout.getvalue()
log.debug('Ceph health: %s', out.rstrip('\n'))
if out.split(None, 1)[0] == 'HEALTH_OK':
break
time.sleep(1)


def get_nodes_using_roles(ctx, config, role):
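    """Return the list of node names that carry the given role; monitors
    come back as fully qualified host names, all other roles as short
    host names.
    """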
    nodes = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for _ in teuthology.roles_of_type(roles_for_host, role):
            if role == 'mon':
                name = str(remote).split('@')[-1]
            else:
                name = str(remote).split('.')[0].split('@')[1]
            nodes.append(name)
    return nodes


def get_dev_for_osd(ctx, config):
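    """Map each osd role to a scratch device on its host and return the
    list as 'shortname:device' strings, as ceph-deploy expects.
    """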
osd_devs = []
for remote, roles_for_host in ctx.cluster.remotes.iteritems():
host = remote.name.split('@')[-1]
shortname = host.split('.')[0]
devs = teuthology.get_scratch_devices(remote)
        num_osds = len(list(teuthology.roles_of_type(roles_for_host, 'osd')))
assert num_osds <= len(devs), 'fewer disks than osds on ' + shortname
for dev in devs[:num_osds]:
dev_short = dev.split('/')[-1]
osd_devs.append('{host}:{dev}'.format(host=shortname, dev=dev_short))
return osd_devs


def get_all_nodes(ctx, config):
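    """Return the short host names of all targets as one space-separated
    string.
    """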
nodelist = []
    for t, _ in ctx.config['targets'].iteritems():
host = t.split('@')[-1]
simple_host = host.split('.')[0]
nodelist.append(simple_host)
nodelist = " ".join(nodelist)
return nodelist


def execute_ceph_deploy(ctx, config, cmd):
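    """Run a single ceph-deploy command in the checkout on the admin node
    and return its exit status.
    """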
testdir = teuthology.get_testdir(ctx)
ceph_admin = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(ceph_admin).remotes.iterkeys()
    proc = remote.run(
        args=[
            'cd',
            '{tdir}/ceph-deploy'.format(tdir=testdir),
            run.Raw('&&'),
            run.Raw(cmd),
        ],
        check_status=False,
    )
    return proc.exitstatus


@contextlib.contextmanager
def build_ceph_cluster(ctx, config):
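    """Deploy a Ceph cluster with ceph-deploy: create and start the mons,
    install the requested branch, gather keys, deploy any mds roles, and
    create one osd per scratch device. On exit, stop ceph, archive logs
    and mon data, then purge packages and data from all nodes.
    """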
log.info('Building ceph cluster using ceph-deploy...')
testdir = teuthology.get_testdir(ctx)
    # Build the branch flag for 'ceph-deploy install': '--testing' is a bare
    # flag, every other branch type takes a value (e.g. '--stable=bobtail').
    ceph_branch = ''
    if config.get('branch'):
        cbranch = config.get('branch')
        for var, val in cbranch.iteritems():
            if var == 'testing':
                ceph_branch = '--{var}'.format(var=var)
            else:
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
all_nodes = get_all_nodes(ctx, config)
mds_nodes = get_nodes_using_roles(ctx, config, 'mds')
mds_nodes = " ".join(mds_nodes)
mon_node = get_nodes_using_roles(ctx, config, 'mon')
mon_nodes = " ".join(mon_node)
    new_mon = './ceph-deploy new ' + mon_nodes
    install_nodes = './ceph-deploy install ' + ceph_branch + ' ' + all_nodes
    purge_nodes = './ceph-deploy purge ' + all_nodes
    purgedata_nodes = './ceph-deploy purgedata ' + all_nodes
    mon_hostname = mon_nodes.split(' ')[0]
    gather_keys = './ceph-deploy gatherkeys ' + mon_hostname
    deploy_mds = './ceph-deploy mds create ' + mds_nodes
no_of_osds = 0
    if not mon_nodes:
        raise RuntimeError("no monitor nodes in the config file")
estatus_new = execute_ceph_deploy(ctx, config, new_mon)
if estatus_new != 0:
raise RuntimeError("ceph-deploy: new command failed")
log.info('adding config inputs...')
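    # Append any 'conf' sections from the task config to the ceph.conf
    # generated by 'ceph-deploy new', before the packages are installed.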
    conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
first_mon = teuthology.get_first_mon(ctx, config)
(remote,) = ctx.cluster.only(first_mon).remotes.keys()
if config.get('conf') is not None:
confp = config.get('conf')
for section, keys in confp.iteritems():
lines = '[{section}]\n'.format(section=section)
teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True)
for key, value in keys.iteritems():
log.info("[%s] %s = %s" % (section, key, value))
lines = '{key} = {value}\n'.format(key=key, value=value)
teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True)
estatus_install = execute_ceph_deploy(ctx, config, install_nodes)
if estatus_install != 0:
raise RuntimeError("ceph-deploy: Failed to install ceph")
    mon_no = config.get('mon_initial_members')
    if mon_no is not None:
        initial_mons = " ".join(mon_node[:mon_no])
        # Each 'mon create' call passes the initial members plus one more
        # monitor, so the quorum grows one node at a time.
for k in range(mon_no, len(mon_node)):
            mon_create_nodes = './ceph-deploy mon create ' + \
                initial_mons + ' ' + mon_node[k]
estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes)
if estatus_mon != 0:
raise RuntimeError("ceph-deploy: Failed to create monitor")
else:
        mon_create_nodes = './ceph-deploy mon create ' + mon_nodes
estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes)
if estatus_mon != 0:
raise RuntimeError("ceph-deploy: Failed to create monitors")
estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
    while estatus_gather != 0:
        # gatherkeys fails until the monitors have formed quorum; retry
        # until it succeeds.
        estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
if mds_nodes:
estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds)
if estatus_mds != 0:
raise RuntimeError("ceph-deploy: Failed to deploy mds")
if config.get('test_mon_destroy') is not None:
for d in range(1, len(mon_node)):
            mon_destroy_nodes = './ceph-deploy mon destroy ' + mon_node[d]
estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes)
if estatus_mon_d != 0:
raise RuntimeError("ceph-deploy: Failed to delete monitor")
node_dev_list = get_dev_for_osd(ctx, config)
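    # Create one osd per host:device pair; if creation fails, zap the disk
    # explicitly and retry once before giving up.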
for d in node_dev_list:
        osd_create_cmds = './ceph-deploy osd create --zap-disk ' + d
estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds)
if estatus_osd == 0:
log.info('successfully created osd')
no_of_osds += 1
else:
            zap_disk = './ceph-deploy disk zap ' + d
execute_ceph_deploy(ctx, config, zap_disk)
estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds)
if estatus_osd == 0:
log.info('successfully created osd')
no_of_osds += 1
else:
raise RuntimeError("ceph-deploy: Failed to create osds")
if config.get('wait-for-healthy', True) and no_of_osds >= 2:
is_healthy(ctx=ctx, config=None)
log.info('Setting up client nodes...')
conf_path = '/etc/ceph/ceph.conf'
admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
first_mon = teuthology.get_first_mon(ctx, config)
(mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
conf_data = teuthology.get_file(
remote=mon0_remote,
path=conf_path,
sudo=True,
)
admin_keyring = teuthology.get_file(
remote=mon0_remote,
path=admin_keyring_path,
sudo=True,
)
clients = ctx.cluster.only(teuthology.is_type('client'))
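        # For each client role, create its keyring on the first mon with
        # 'ceph auth get-or-create', then push that keyring plus the admin
        # keyring and ceph.conf out to the client node.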
for remot, roles_for_host in clients.remotes.iteritems():
for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
client_keyring = '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
mon0_remote.run(
args=[
'cd',
'{tdir}'.format(tdir=testdir),
run.Raw('&&'),
                        'sudo', 'bash', '-c',
                        run.Raw('"'), 'ceph',
'auth',
'get-or-create',
'client.{id}'.format(id=id_),
'mds', 'allow',
'mon', 'allow *',
'osd', 'allow *',
run.Raw('>'),
client_keyring,
run.Raw('"'),
],
)
key_data = teuthology.get_file(
remote=mon0_remote,
path=client_keyring,
sudo=True,
)
teuthology.sudo_write_file(
remote=remot,
path=client_keyring,
data=key_data,
perms='0644'
)
teuthology.sudo_write_file(
remote=remot,
path=admin_keyring_path,
data=admin_keyring,
perms='0644'
)
teuthology.sudo_write_file(
remote=remot,
path=conf_path,
data=conf_data,
perms='0644'
)
else:
raise RuntimeError("The cluster is NOT operational due to insufficient OSDs")
try:
yield
finally:
log.info('Stopping ceph...')
ctx.cluster.run(args=[
'sudo', 'stop', 'ceph-all',
run.Raw('||'),
'sudo', 'service', 'ceph', 'stop'
])
if ctx.archive is not None:
# archive mon data, too
log.info('Archiving mon data...')
path = os.path.join(ctx.archive, 'data')
os.makedirs(path)
mons = ctx.cluster.only(teuthology.is_type('mon'))
for remote, roles in mons.remotes.iteritems():
for role in roles:
if role.startswith('mon.'):
teuthology.pull_directory_tarball(
remote,
'/var/lib/ceph/mon',
path + '/' + role + '.tgz')
log.info('Compressing logs...')
run.wait(
ctx.cluster.run(
args=[
'sudo',
'find',
'/var/log/ceph',
'-name',
'*.log',
'-print0',
run.Raw('|'),
'sudo',
'xargs',
'-0',
'--no-run-if-empty',
'--',
'gzip',
'--',
],
wait=False,
),
)
log.info('Archiving logs...')
path = os.path.join(ctx.archive, 'remote')
os.makedirs(path)
for remote in ctx.cluster.remotes.iterkeys():
sub = os.path.join(path, remote.shortname)
os.makedirs(sub)
teuthology.pull_directory(remote, '/var/log/ceph',
os.path.join(sub, 'log'))
log.info('Purging package...')
execute_ceph_deploy(ctx, config, purge_nodes)
log.info('Purging data...')
execute_ceph_deploy(ctx, config, purgedata_nodes)


@contextlib.contextmanager
def task(ctx, config):
"""
Set up and tear down a Ceph cluster.
For example::
tasks:
- install:
extras: yes
- ssh_keys:
- ceph-deploy:
branch:
stable: bobtail
mon_initial_members: 1
tasks:
- install:
extras: yes
- ssh_keys:
- ceph-deploy:
branch:
dev: master
conf:
mon:
debug mon = 20
tasks:
- install:
extras: yes
- ssh_keys:
- ceph-deploy:
branch:
testing:
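
    Any of these settings can also come in through the standard teuthology
    overrides mechanism, for example::

        overrides:
            ceph-deploy:
                conf:
                    mon:
                        debug mon = 20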
"""
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task ceph-deploy only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
if config.get('branch') is not None:
assert isinstance(config['branch'], dict), 'branch must be a dictionary'
with contextutil.nested(
lambda: ceph_fn.ship_utilities(ctx=ctx, config=None),
lambda: download_ceph_deploy(ctx=ctx, config=config),
lambda: build_ceph_cluster(ctx=ctx, config=dict(
conf=config.get('conf', {}),
            branch=config.get('branch', {}),
mon_initial_members=config.get('mon_initial_members', None),
test_mon_destroy=config.get('test_mon_destroy', None),
)),
):
yield