ceph/tasks/rebuild_mondb.py
Kefu Chai a192c786b3 tasks: add rebuild_mondb
* tasks/rebuild_mondb.py: this task
  1. removes all store.db on all monitors
  2. rebuild the store.db for the first mon
  3. start the first mon
  4. run mkfs on other mon
  5. and revive them
* suites/rados/singleton/all/rebuild-mon-db.yaml
  1. run rados/test.sh
  2. run rebuild_mondb task

Fixes: http://tracker.ceph.com/issues/17179
Signed-off-by: Kefu Chai <kchai@redhat.com>
2016-09-08 21:35:01 +08:00

183 lines
6.6 KiB
Python

"""
Test if we can recover the monitor leveldb store from the OSDs after the
leveldb stores of all monitors are corrupted
"""
import logging
import os.path
import shutil
import tempfile
import ceph_manager
from teuthology import misc as teuthology
# Module-level logger; task() passes a child of it to CephManager.
log = logging.getLogger(__name__)
def push_directory(path, remote, remote_dir):
    """
    Copy the contents of a local directory into a directory on a remote.

    Roughly equivalent to::

        local_temp_path=`mktemp`
        tar czf $local_temp_path -C $path -- .
        remote_temp_path=<unique temporary file name>
        scp $local_temp_path remote:$remote_temp_path
        rm $local_temp_path
        ssh remote sudo tar xzf $remote_temp_path -C $remote_dir
        ssh remote sudo rm -fr $remote_temp_path

    :param path: local directory whose contents are archived
    :param remote: object providing ``put_file()`` and ``run()`` for the
                   target host
    :param remote_dir: directory on the remote the archive is extracted
                       into; it is NOT created here, the caller must make
                       sure it exists
    """
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    try:
        cmd = ' '.join(['tar', 'cz',
                        '-f', local_temp_path,
                        '-C', path,
                        '--', '.'])
        teuthology.sh(cmd)
        # mkstemp() is only used to come up with a unique file name for the
        # remote side. It also creates that file locally, so drop the
        # placeholder right away instead of leaking one file per call.
        fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                                prefix='rebuild_mondb-')
        os.close(fd)
        os.remove(remote_temp_path)
        remote.put_file(local_temp_path, remote_temp_path)
    finally:
        # the local tarball is not needed once uploaded (or once we failed)
        os.remove(local_temp_path)
    try:
        remote.run(args=['sudo',
                         'tar', 'xz',
                         '-C', remote_dir,
                         '-f', remote_temp_path])
    finally:
        # do not leave the uploaded tarball behind even if extraction fails
        remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
def task(ctx, config):
    """
    Test monitor recovery from OSD

    Steps:

    1. kill every monitor; remove ``store.db`` of the first monitor and the
       whole data directory of every other monitor
    2. let every OSD in turn update a monitor store with
       ``ceph-objectstore-tool --op update-mon-db``
    3. push the resulting store to the first monitor and run
       ``ceph-monstore-tool ... rebuild`` on it
    4. run ``ceph-mon --mkfs`` for the other monitors, revive all monitors
       and wait for them to form a quorum
    5. revive all OSDs

    :param ctx: teuthology context; ``ctx.cluster`` is used to find the
                mon and osd remotes
    :param config: ``None`` or a dict. The optional key ``keyring_path``
                   overrides the keyring passed to the monitor tools
                   (default: ``/etc/ceph/<cluster>.keyring``).
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    assert mons
    # The monitor we rebuild must be the one living on ``mon``. Derive its
    # cluster name and id from the very same role instead of taking whatever
    # mon role happens to come first when iterating the remotes, otherwise
    # the recovered store could be pushed to the wrong host/path.
    cluster_name, _, mon_id = teuthology.split_role(first_mon)
    _nuke_mons(manager, mons, cluster_name, mon_id)

    # collect the maps from all OSDs
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    assert osds
    local_mstore = _collect_osd_maps(ctx, manager, osds, cluster_name)

    # recover the first mon with the re-built mon db: push the store
    # collected from the OSDs into its data directory
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    try:
        push_directory(local_mstore, mon, mon_store_dir)
        mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    finally:
        shutil.rmtree(local_mstore)

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    mon.run(args=['sudo', '-u', 'ceph',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring',
                  keyring_path])

    _revive_mons(manager, mons, cluster_name, mon_id, keyring_path)
    _revive_osds(manager, osds)


def _roles_of_type(cluster, role_type):
    """Yield (remote, cluster_name, id) for each role of the given type."""
    is_wanted = teuthology.is_type(role_type)
    # items() instead of iteritems() so this works on python 2 and 3 alike
    for remote, roles in cluster.remotes.items():
        for role in roles:
            if not is_wanted(role):
                continue
            role_cluster, _, role_id = teuthology.split_role(role)
            yield remote, role_cluster, role_id


def _nuke_mons(manager, mons, cluster_name, mon_id):
    """Kill all monitors and remove their stores from disk."""
    for remote, cluster, m in _roles_of_type(mons, 'mon'):
        assert cluster_name == cluster
        log.info('killing {cluster}:mon.{mon}'.format(
            cluster=cluster,
            mon=m))
        manager.kill_mon(m)
        mon_data = os.path.join('/var/lib/ceph/mon/',
                                '{0}-{1}'.format(cluster_name, m))
        if m == mon_id:
            # so we will only need to recreate the store.db for the
            # first mon, would be easier than mkfs on it then replace
            # its store.db with the recovered one
            store_dir = os.path.join(mon_data, 'store.db')
            remote.run(args=['sudo', 'rm', '-r', store_dir])
        else:
            remote.run(args=['sudo', 'rm', '-r', mon_data])


def _collect_osd_maps(ctx, manager, osds, cluster_name):
    """Let each OSD update a mon store; return the local dir holding it."""
    local_mstore = tempfile.mkdtemp()
    try:
        for osd, cluster, osd_id in _roles_of_type(osds, 'osd'):
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx),
                                      'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])
            push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
            options = '--op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db; create the destination first so the
            # log line names the directory that is actually used
            local_mstore = tempfile.mkdtemp()
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])
    except Exception:
        # do not leak the local copy when a step fails; the directory may
        # already be gone, hence ignore_errors
        shutil.rmtree(local_mstore, ignore_errors=True)
        raise
    return local_mstore


def _revive_mons(manager, mons, cluster_name, mon_id, keyring_path):
    """mkfs the wiped monitors, revive all monitors and wait for quorum."""
    # the initial monmap is in the ceph.conf, so we are good.
    n_mons = 0
    for remote, cluster, m in _roles_of_type(mons, 'mon'):
        assert cluster_name == cluster
        if mon_id != m:
            log.info('running mkfs on {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            remote.run(
                args=[
                    'sudo',
                    'ceph-mon',
                    '--cluster', cluster,
                    '--mkfs',
                    '-i', m,
                    '--keyring', keyring_path])
        manager.revive_mon(m)
        n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)


def _revive_osds(manager, osds):
    """Revive every OSD of the cluster."""
    for _, _, osd_id in _roles_of_type(osds, 'osd'):
        log.info('reviving osd.{0}'.format(osd_id))
        manager.revive_osd(osd_id)