mirror of
https://github.com/ceph/ceph
synced 2024-12-23 11:54:11 +00:00
5765fde1aa
ignore errors on informational service status
465 lines
17 KiB
Python
465 lines
17 KiB
Python
"""
|
|
Execute ceph-deploy as a task
|
|
"""
|
|
from cStringIO import StringIO
|
|
|
|
import contextlib
|
|
import os
|
|
import time
|
|
import logging
|
|
|
|
from teuthology import misc as teuthology
|
|
from teuthology import contextutil
|
|
from teuthology.config import config as teuth_config
|
|
from teuthology.task import install as install_fn
|
|
from teuthology.orchestra import run
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
@contextlib.contextmanager
|
|
def download_ceph_deploy(ctx, config):
|
|
"""
|
|
Downloads ceph-deploy from the ceph.com git mirror and (by default)
|
|
switches to the master branch. If the `ceph-deploy-branch` is specified, it
|
|
will use that instead.
|
|
"""
|
|
log.info('Downloading ceph-deploy...')
|
|
testdir = teuthology.get_testdir(ctx)
|
|
ceph_admin = teuthology.get_first_mon(ctx, config)
|
|
default_cd_branch = {'ceph-deploy-branch': 'master'}
|
|
ceph_deploy_branch = config.get(
|
|
'ceph-deploy',
|
|
default_cd_branch).get('ceph-deploy-branch')
|
|
|
|
ctx.cluster.only(ceph_admin).run(
|
|
args=[
|
|
'git', 'clone', '-b', ceph_deploy_branch,
|
|
teuth_config.ceph_git_base_url + 'ceph-deploy.git',
|
|
'{tdir}/ceph-deploy'.format(tdir=testdir),
|
|
],
|
|
)
|
|
ctx.cluster.only(ceph_admin).run(
|
|
args=[
|
|
'cd',
|
|
'{tdir}/ceph-deploy'.format(tdir=testdir),
|
|
run.Raw('&&'),
|
|
'./bootstrap',
|
|
],
|
|
)
|
|
|
|
try:
|
|
yield
|
|
finally:
|
|
log.info('Removing ceph-deploy ...')
|
|
ctx.cluster.only(ceph_admin).run(
|
|
args=[
|
|
'rm',
|
|
'-rf',
|
|
'{tdir}/ceph-deploy'.format(tdir=testdir),
|
|
],
|
|
)
|
|
|
|
|
|
def is_healthy(ctx, config):
|
|
"""Wait until a Ceph cluster is healthy."""
|
|
testdir = teuthology.get_testdir(ctx)
|
|
ceph_admin = teuthology.get_first_mon(ctx, config)
|
|
(remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
|
|
max_tries = 90 # 90 tries * 10 secs --> 15 minutes
|
|
tries = 0
|
|
while True:
|
|
tries += 1
|
|
if tries >= max_tries:
|
|
msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
|
|
raise RuntimeError(msg)
|
|
|
|
r = remote.run(
|
|
args=[
|
|
'cd',
|
|
'{tdir}'.format(tdir=testdir),
|
|
run.Raw('&&'),
|
|
'sudo', 'ceph',
|
|
'health',
|
|
],
|
|
stdout=StringIO(),
|
|
logger=log.getChild('health'),
|
|
)
|
|
out = r.stdout.getvalue()
|
|
log.debug('Ceph health: %s', out.rstrip('\n'))
|
|
if out.split(None, 1)[0] == 'HEALTH_OK':
|
|
break
|
|
time.sleep(10)
|
|
|
|
def get_nodes_using_roles(ctx, config, role):
|
|
"""Extract the names of nodes that match a given role from a cluster"""
|
|
newl = []
|
|
for _remote, roles_for_host in ctx.cluster.remotes.iteritems():
|
|
for id_ in teuthology.roles_of_type(roles_for_host, role):
|
|
rem = _remote
|
|
if role == 'mon':
|
|
req1 = str(rem).split('@')[-1]
|
|
else:
|
|
req = str(rem).split('.')[0]
|
|
req1 = str(req).split('@')[1]
|
|
newl.append(req1)
|
|
return newl
|
|
|
|
def get_dev_for_osd(ctx, config):
|
|
"""Get a list of all osd device names."""
|
|
osd_devs = []
|
|
for remote, roles_for_host in ctx.cluster.remotes.iteritems():
|
|
host = remote.name.split('@')[-1]
|
|
shortname = host.split('.')[0]
|
|
devs = teuthology.get_scratch_devices(remote)
|
|
num_osd_per_host = list(teuthology.roles_of_type(roles_for_host, 'osd'))
|
|
num_osds = len(num_osd_per_host)
|
|
assert num_osds <= len(devs), 'fewer disks than osds on ' + shortname
|
|
for dev in devs[:num_osds]:
|
|
dev_short = dev.split('/')[-1]
|
|
osd_devs.append('{host}:{dev}'.format(host=shortname, dev=dev_short))
|
|
return osd_devs
|
|
|
|
def get_all_nodes(ctx, config):
|
|
"""Return a string of node names separated by blanks"""
|
|
nodelist = []
|
|
for t, k in ctx.config['targets'].iteritems():
|
|
host = t.split('@')[-1]
|
|
simple_host = host.split('.')[0]
|
|
nodelist.append(simple_host)
|
|
nodelist = " ".join(nodelist)
|
|
return nodelist
|
|
|
|
def execute_ceph_deploy(ctx, config, cmd):
|
|
"""Remotely execute a ceph_deploy command"""
|
|
testdir = teuthology.get_testdir(ctx)
|
|
ceph_admin = teuthology.get_first_mon(ctx, config)
|
|
exec_cmd = cmd
|
|
(remote,) = ctx.cluster.only(ceph_admin).remotes.iterkeys()
|
|
proc = remote.run(
|
|
args = [
|
|
'cd',
|
|
'{tdir}/ceph-deploy'.format(tdir=testdir),
|
|
run.Raw('&&'),
|
|
run.Raw(exec_cmd),
|
|
],
|
|
check_status=False,
|
|
)
|
|
exitstatus = proc.exitstatus
|
|
return exitstatus
|
|
|
|
|
|
@contextlib.contextmanager
|
|
def build_ceph_cluster(ctx, config):
|
|
"""Build a ceph cluster"""
|
|
|
|
try:
|
|
log.info('Building ceph cluster using ceph-deploy...')
|
|
testdir = teuthology.get_testdir(ctx)
|
|
ceph_branch = None
|
|
if config.get('branch') is not None:
|
|
cbranch = config.get('branch')
|
|
for var, val in cbranch.iteritems():
|
|
if var == 'testing':
|
|
ceph_branch = '--{var}'.format(var=var)
|
|
ceph_branch = '--{var}={val}'.format(var=var, val=val)
|
|
node_dev_list = []
|
|
all_nodes = get_all_nodes(ctx, config)
|
|
mds_nodes = get_nodes_using_roles(ctx, config, 'mds')
|
|
mds_nodes = " ".join(mds_nodes)
|
|
mon_node = get_nodes_using_roles(ctx, config, 'mon')
|
|
mon_nodes = " ".join(mon_node)
|
|
new_mon = './ceph-deploy new'+" "+mon_nodes
|
|
install_nodes = './ceph-deploy install '+ceph_branch+" "+all_nodes
|
|
purge_nodes = './ceph-deploy purge'+" "+all_nodes
|
|
purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes
|
|
mon_hostname = mon_nodes.split(' ')[0]
|
|
mon_hostname = str(mon_hostname)
|
|
gather_keys = './ceph-deploy gatherkeys'+" "+mon_hostname
|
|
deploy_mds = './ceph-deploy mds create'+" "+mds_nodes
|
|
no_of_osds = 0
|
|
|
|
if mon_nodes is None:
|
|
raise RuntimeError("no monitor nodes in the config file")
|
|
|
|
estatus_new = execute_ceph_deploy(ctx, config, new_mon)
|
|
if estatus_new != 0:
|
|
raise RuntimeError("ceph-deploy: new command failed")
|
|
|
|
log.info('adding config inputs...')
|
|
testdir = teuthology.get_testdir(ctx)
|
|
conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
|
|
first_mon = teuthology.get_first_mon(ctx, config)
|
|
(remote,) = ctx.cluster.only(first_mon).remotes.keys()
|
|
|
|
lines = None
|
|
if config.get('conf') is not None:
|
|
confp = config.get('conf')
|
|
for section, keys in confp.iteritems():
|
|
lines = '[{section}]\n'.format(section=section)
|
|
teuthology.append_lines_to_file(remote, conf_path, lines,
|
|
sudo=True)
|
|
for key, value in keys.iteritems():
|
|
log.info("[%s] %s = %s" % (section, key, value))
|
|
lines = '{key} = {value}\n'.format(key=key, value=value)
|
|
teuthology.append_lines_to_file(remote, conf_path, lines,
|
|
sudo=True)
|
|
|
|
estatus_install = execute_ceph_deploy(ctx, config, install_nodes)
|
|
if estatus_install != 0:
|
|
raise RuntimeError("ceph-deploy: Failed to install ceph")
|
|
|
|
mon_create_nodes = './ceph-deploy mon create-initial'
|
|
# If the following fails, it is OK, it might just be that the monitors
|
|
# are taking way more than a minute/monitor to form quorum, so lets
|
|
# try the next block which will wait up to 15 minutes to gatherkeys.
|
|
estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes)
|
|
|
|
estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
|
|
max_gather_tries = 90
|
|
gather_tries = 0
|
|
while (estatus_gather != 0):
|
|
gather_tries += 1
|
|
if gather_tries >= max_gather_tries:
|
|
msg = 'ceph-deploy was not able to gatherkeys after 15 minutes'
|
|
raise RuntimeError(msg)
|
|
estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
|
|
time.sleep(10)
|
|
|
|
if mds_nodes:
|
|
estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds)
|
|
if estatus_mds != 0:
|
|
raise RuntimeError("ceph-deploy: Failed to deploy mds")
|
|
|
|
if config.get('test_mon_destroy') is not None:
|
|
for d in range(1, len(mon_node)):
|
|
mon_destroy_nodes = './ceph-deploy mon destroy'+" "+mon_node[d]
|
|
estatus_mon_d = execute_ceph_deploy(ctx, config,
|
|
mon_destroy_nodes)
|
|
if estatus_mon_d != 0:
|
|
raise RuntimeError("ceph-deploy: Failed to delete monitor")
|
|
|
|
node_dev_list = get_dev_for_osd(ctx, config)
|
|
for d in node_dev_list:
|
|
osd_create_cmds = './ceph-deploy osd create --zap-disk'+" "+d
|
|
estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds)
|
|
if estatus_osd == 0:
|
|
log.info('successfully created osd')
|
|
no_of_osds += 1
|
|
else:
|
|
zap_disk = './ceph-deploy disk zap'+" "+d
|
|
execute_ceph_deploy(ctx, config, zap_disk)
|
|
estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds)
|
|
if estatus_osd == 0:
|
|
log.info('successfully created osd')
|
|
no_of_osds += 1
|
|
else:
|
|
raise RuntimeError("ceph-deploy: Failed to create osds")
|
|
|
|
if config.get('wait-for-healthy', True) and no_of_osds >= 2:
|
|
is_healthy(ctx=ctx, config=None)
|
|
|
|
log.info('Setting up client nodes...')
|
|
conf_path = '/etc/ceph/ceph.conf'
|
|
admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
|
|
first_mon = teuthology.get_first_mon(ctx, config)
|
|
(mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
|
|
conf_data = teuthology.get_file(
|
|
remote=mon0_remote,
|
|
path=conf_path,
|
|
sudo=True,
|
|
)
|
|
admin_keyring = teuthology.get_file(
|
|
remote=mon0_remote,
|
|
path=admin_keyring_path,
|
|
sudo=True,
|
|
)
|
|
|
|
clients = ctx.cluster.only(teuthology.is_type('client'))
|
|
for remot, roles_for_host in clients.remotes.iteritems():
|
|
for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
|
|
client_keyring = \
|
|
'/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
|
|
mon0_remote.run(
|
|
args=[
|
|
'cd',
|
|
'{tdir}'.format(tdir=testdir),
|
|
run.Raw('&&'),
|
|
'sudo', 'bash', '-c',
|
|
run.Raw('"'), 'ceph',
|
|
'auth',
|
|
'get-or-create',
|
|
'client.{id}'.format(id=id_),
|
|
'mds', 'allow',
|
|
'mon', 'allow *',
|
|
'osd', 'allow *',
|
|
run.Raw('>'),
|
|
client_keyring,
|
|
run.Raw('"'),
|
|
],
|
|
)
|
|
key_data = teuthology.get_file(
|
|
remote=mon0_remote,
|
|
path=client_keyring,
|
|
sudo=True,
|
|
)
|
|
teuthology.sudo_write_file(
|
|
remote=remot,
|
|
path=client_keyring,
|
|
data=key_data,
|
|
perms='0644'
|
|
)
|
|
teuthology.sudo_write_file(
|
|
remote=remot,
|
|
path=admin_keyring_path,
|
|
data=admin_keyring,
|
|
perms='0644'
|
|
)
|
|
teuthology.sudo_write_file(
|
|
remote=remot,
|
|
path=conf_path,
|
|
data=conf_data,
|
|
perms='0644'
|
|
)
|
|
else:
|
|
raise RuntimeError(
|
|
"The cluster is NOT operational due to insufficient OSDs")
|
|
yield
|
|
|
|
finally:
|
|
log.info('Stopping ceph...')
|
|
ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
|
|
'sudo', 'service', 'ceph', 'stop' ])
|
|
|
|
# Are you really not running anymore?
|
|
# try first with the init tooling
|
|
# ignoring the status so this becomes informational only
|
|
ctx.cluster.run(args=['sudo', 'status', 'ceph-all', run.Raw('||'),
|
|
'sudo', 'service', 'ceph', 'status'],
|
|
check_status=False)
|
|
|
|
# and now just check for the processes themselves, as if upstart/sysvinit
|
|
# is lying to us. Ignore errors if the grep fails
|
|
ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
|
|
'grep', '-v', 'grep', run.Raw('|'),
|
|
'grep', 'ceph'], check_status=False)
|
|
|
|
if ctx.archive is not None:
|
|
# archive mon data, too
|
|
log.info('Archiving mon data...')
|
|
path = os.path.join(ctx.archive, 'data')
|
|
os.makedirs(path)
|
|
mons = ctx.cluster.only(teuthology.is_type('mon'))
|
|
for remote, roles in mons.remotes.iteritems():
|
|
for role in roles:
|
|
if role.startswith('mon.'):
|
|
teuthology.pull_directory_tarball(
|
|
remote,
|
|
'/var/lib/ceph/mon',
|
|
path + '/' + role + '.tgz')
|
|
|
|
log.info('Compressing logs...')
|
|
run.wait(
|
|
ctx.cluster.run(
|
|
args=[
|
|
'sudo',
|
|
'find',
|
|
'/var/log/ceph',
|
|
'-name',
|
|
'*.log',
|
|
'-print0',
|
|
run.Raw('|'),
|
|
'sudo',
|
|
'xargs',
|
|
'-0',
|
|
'--no-run-if-empty',
|
|
'--',
|
|
'gzip',
|
|
'--',
|
|
],
|
|
wait=False,
|
|
),
|
|
)
|
|
|
|
log.info('Archiving logs...')
|
|
path = os.path.join(ctx.archive, 'remote')
|
|
os.makedirs(path)
|
|
for remote in ctx.cluster.remotes.iterkeys():
|
|
sub = os.path.join(path, remote.shortname)
|
|
os.makedirs(sub)
|
|
teuthology.pull_directory(remote, '/var/log/ceph',
|
|
os.path.join(sub, 'log'))
|
|
|
|
# Prevent these from being undefined if the try block fails
|
|
all_nodes = get_all_nodes(ctx, config)
|
|
purge_nodes = './ceph-deploy purge'+" "+all_nodes
|
|
purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes
|
|
|
|
log.info('Purging package...')
|
|
execute_ceph_deploy(ctx, config, purge_nodes)
|
|
log.info('Purging data...')
|
|
execute_ceph_deploy(ctx, config, purgedata_nodes)
|
|
|
|
|
|
@contextlib.contextmanager
|
|
def task(ctx, config):
|
|
"""
|
|
Set up and tear down a Ceph cluster.
|
|
|
|
For example::
|
|
|
|
tasks:
|
|
- install:
|
|
extras: yes
|
|
- ssh_keys:
|
|
- ceph-deploy:
|
|
branch:
|
|
stable: bobtail
|
|
mon_initial_members: 1
|
|
|
|
tasks:
|
|
- install:
|
|
extras: yes
|
|
- ssh_keys:
|
|
- ceph-deploy:
|
|
branch:
|
|
dev: master
|
|
conf:
|
|
mon:
|
|
debug mon = 20
|
|
|
|
tasks:
|
|
- install:
|
|
extras: yes
|
|
- ssh_keys:
|
|
- ceph-deploy:
|
|
branch:
|
|
testing:
|
|
"""
|
|
if config is None:
|
|
config = {}
|
|
|
|
overrides = ctx.config.get('overrides', {})
|
|
teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
|
|
|
|
assert isinstance(config, dict), \
|
|
"task ceph-deploy only supports a dictionary for configuration"
|
|
|
|
overrides = ctx.config.get('overrides', {})
|
|
teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
|
|
|
|
if config.get('branch') is not None:
|
|
assert isinstance(config['branch'], dict), 'branch must be a dictionary'
|
|
|
|
with contextutil.nested(
|
|
lambda: install_fn.ship_utilities(ctx=ctx, config=None),
|
|
lambda: download_ceph_deploy(ctx=ctx, config=config),
|
|
lambda: build_ceph_cluster(ctx=ctx, config=dict(
|
|
conf=config.get('conf', {}),
|
|
branch=config.get('branch',{}),
|
|
mon_initial_members=config.get('mon_initial_members', None),
|
|
test_mon_destroy=config.get('test_mon_destroy', None),
|
|
)),
|
|
):
|
|
yield
|