qa/suites/rados: replace mon_seesaw.py task with a small bash script

The teuthology test did not like the change to remove 'mon addr' from
ceph.conf.  The standalone script is easier to test.

Note that it avoids mon names 'a', 'b', 'c' since the MonMap::build_initial
uses those.

Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Sage Weil 2018-12-21 12:26:29 -06:00
parent f857c70c9c
commit 16980bd12f
4 changed files with 90 additions and 230 deletions

72
qa/standalone/mon/mon-seesaw.sh Executable file
View File

@ -0,0 +1,72 @@
#!/usr/bin/env bash
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
# Prepare the shared environment and run each requested test function.
#
# Exports the three monitor addresses used by this script, appends a fresh
# fsid and --auth-supported=none to CEPH_ARGS, saves that value in
# BASE_CEPH_ARGS (the tests rebuild CEPH_ARGS from it with a different
# --mon-host), and finally points CEPH_ARGS at the first monitor.
#
# Arguments:
#   $1    - test directory, handed to setup, the test function and teardown
#   $2... - names of the functions to run; when none are given, every shell
#           function whose name matches TEST_[0-9a-z_]* is run
# Returns:
#   0 when every setup/test/teardown step succeeds, 1 on the first failure.
function run() {
    local dir=$1
    shift

    export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
    export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
    export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    export BASE_CEPH_ARGS=$CEPH_ARGS
    CEPH_ARGS+="--mon-host=$CEPH_MON_A "

    # $funcs is a whitespace-separated list of function names, so it is
    # deliberately left unquoted in the loop below.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    for func in $funcs ; do
        # Quote the directory so a path containing whitespace or glob
        # characters reaches each step as a single argument.
        setup "$dir" || return 1
        $func "$dir" || return 1
        teardown "$dir" || return 1
    done
}
# Replace the cluster's only monitor with a second one, then bring the first
# one back, checking at each stage that quorum is reached and that the OSDs
# can still talk to the monitors.
#
# Globals:
#   CEPH_MON_A, CEPH_MON_B, CEPH_MON_C (read) - monitor addresses
#   BASE_CEPH_ARGS (read)                     - CEPH_ARGS without --mon-host
#   CEPH_ARGS (written)                       - re-pointed at the live mons
# Arguments:
#   $1 - test directory
# Returns:
#   0 on success, 1 as soon as any step fails.
function TEST_mon_seesaw() {
    local dir=$1

    setup "$dir" || return 1

    # start with 1 mon
    run_mon "$dir" aa --public-addr "$CEPH_MON_A" || return 1
    run_mgr "$dir" x || return 1
    run_osd "$dir" 0 || return 1
    run_osd "$dir" 1 || return 1
    run_osd "$dir" 2 || return 1

    wait_for_quorum 300 1 || return 1

    # add in a second
    run_mon "$dir" bb --public-addr "$CEPH_MON_B" || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
    wait_for_quorum 300 2 || return 1

    # remove the first one
    ceph mon rm aa || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_B"
    sleep 5
    wait_for_quorum 300 1 || return 1

    # do some stuff that requires the osds be able to communicate with the
    # mons.  (see http://tracker.ceph.com/issues/17558)
    # These two steps are the point of the test, so a failure here must fail
    # the test instead of being silently ignored.
    ceph osd pool create foo 8 || return 1
    rados -p foo bench 1 write || return 1
    wait_for_clean || return 1

    # nuke monstore so that it will rejoin (otherwise we get
    # "not in monmap and have been in a quorum before; must have been removed"
    # ${dir:?} aborts rather than expanding to "/aa" if dir is ever empty.
    rm -rf -- "${dir:?}/aa"

    # add a back in
    # (use a different addr to avoid bind issues)
    run_mon "$dir" aa --public-addr "$CEPH_MON_C" || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_C,$CEPH_MON_B"
    wait_for_quorum 300 2 || return 1
}
# Hand control to main, which is not defined in this file (presumably it comes
# from the ceph-helpers.sh sourced at the top -- confirm), passing the test
# name "mon-seesaw" followed by this script's own arguments.
main mon-seesaw "$@"
# Local Variables:
# compile-command: "cd ../.. ; make -j4 && test/mon/mon-seesaw.sh"
# End:

View File

@ -1,32 +0,0 @@
# Teuthology job that exercised the mon_seesaw task (removed by this commit).
# Task order: install, ceph (with the config and log whitelist below),
# mon_seesaw, create a pool named "test" with pg_num 1, then wait for clean
# with a 60 second timeout.
roles:
- - mon.a
- mgr.x
- osd.0
- osd.1
- osd.2
openstack:
- volumes: # attached to each instance
count: 3
size: 10 # GB
tasks:
- install:
- ceph:
config:
global:
osd pool default min size : 1
osd:
debug monc: 1
debug ms: 1
log-whitelist:
- overall HEALTH
- Manager daemon
- \(MGR_DOWN\)
- \(PG_AVAILABILITY\)
- mon_seesaw:
- ceph_manager.create_pool:
kwargs:
pool_name: test
pg_num: 1
- ceph_manager.wait_for_clean:
kwargs:
timeout: 60

View File

@ -0,0 +1,18 @@
# Teuthology job that runs the standalone mon-seesaw script.
# Roles: one mon, one mgr, three OSDs and one client.
# Task order: install, then a workunit with basedir qa/standalone that runs
# mon/mon-seesaw.sh for "all" clients.
roles:
- - mon.a
- mgr.x
- osd.0
- osd.1
- osd.2
- client.0
openstack:
- volumes: # attached to each instance
count: 3
size: 10 # GB
tasks:
- install:
- workunit:
basedir: qa/standalone
clients:
all:
- mon/mon-seesaw.sh

View File

@ -1,198 +0,0 @@
from cStringIO import StringIO
import contextlib
import logging
import random
from teuthology import misc as teuthology
from teuthology.orchestra import run
from ceph_manager import CephManager, write_conf
log = logging.getLogger(__name__)
def _get_mons(ctx):
    """Return the monitor ids reported by teuthology, minus the 'mon.' prefix.

    Every name returned by ``teuthology.get_mon_names(ctx)`` has its first
    ``len('mon.')`` characters dropped.
    """
    prefix_len = len('mon.')
    mon_ids = []
    for full_name in teuthology.get_mon_names(ctx):
        mon_ids.append(full_name[prefix_len:])
    return mon_ids
# teuthology prepares the monitor IPs (and ports) in get_mons(), we can
# enumerate all monitor ports ([6789..]), and find the next available one.
def _get_next_port(ctx, ip, cluster):
# assuming we have only one cluster here.
used = []
for name in teuthology.get_mon_names(ctx, cluster):
addr = ctx.ceph[cluster].conf[name]['mon addr']
addr_type, mon_ip, mon_port = addr.split(':')
if mon_ip != ip:
continue
used.append(int(mon_port))
port = 6789
used.sort()
for p in used:
if p != port:
break
port += 1
return port
def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
    """Create the store for monitor ``mon`` on ``remote``.

    The current monmap is fetched through ``manager``, copied to ``remote``
    when that is a different host than ``manager.controller``, and used for
    ``ceph-mon --mkfs``.  The temporary monmap file is deleted afterwards.
    When ``conf_path`` is truthy, a 'mon addr' entry for ``name`` is added
    to the cluster configuration and the configuration is rewritten.
    """
    # co-locate a new monitor on remote where an existing monitor is hosted
    cluster = manager.cluster
    controller_is_elsewhere = manager.controller != remote

    remote.run(args=['sudo', 'mkdir', '-p', data_path])

    keyring_path = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster)
    monmap_path = '{tdir}/{cluster}.monmap'.format(
        tdir=teuthology.get_testdir(ctx), cluster=cluster)

    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
    if controller_is_elsewhere:
        monmap_data = teuthology.get_file(manager.controller, monmap_path)
        teuthology.write_file(remote, monmap_path, StringIO(monmap_data))

    mkfs_cmd = ['sudo', 'ceph-mon']
    mkfs_cmd += ['--cluster', cluster]
    mkfs_cmd += ['--mkfs']
    mkfs_cmd += ['-i', mon]
    mkfs_cmd += ['--monmap', monmap_path]
    mkfs_cmd += ['--keyring', keyring_path]
    remote.run(args=mkfs_cmd)

    if controller_is_elsewhere:
        teuthology.delete_file(remote, monmap_path)
    # raw_cluster_cmd() is performed using sudo, so sudo here also.
    teuthology.delete_file(manager.controller, monmap_path, sudo=True)

    if not conf_path:
        return
    # update ceph.conf so that the ceph CLI is able to connect to the cluster
    ip = remote.ip_address
    port = _get_next_port(ctx, ip, cluster)
    ctx.ceph[cluster].conf[name] = {
        'mon addr': '{ip}:{port}'.format(ip=ip, port=port),
    }
    write_conf(ctx, conf_path, cluster)
def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
    """Undo ``_setup_mon``: drop ``name`` from the cluster configuration,
    rewrite the configuration file and delete ``data_path`` on ``remote``.
    """
    cluster_name = manager.cluster
    cluster_conf = ctx.ceph[cluster_name].conf
    del cluster_conf[name]
    write_conf(ctx, conf_path, cluster_name)
    cleanup_cmd = ['sudo', 'rm', '-rf', data_path]
    remote.run(args=cleanup_cmd)
@contextlib.contextmanager
def _prepare_mon(ctx, manager, remote, mon):
    """Context manager that sets up monitor ``mon`` on ``remote`` and tears
    it down again on exit.

    The data path is ``/var/lib/ceph/mon/<cluster>-<mon>`` and the
    configuration path is ``/etc/ceph/<cluster>.conf``, where ``<cluster>``
    is ``manager.cluster``.
    """
    cluster = manager.cluster
    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
        cluster=cluster, id=mon)
    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
    name = 'mon.{0}'.format(mon)
    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
    try:
        yield
    finally:
        # Run the teardown even when the with-body raises; otherwise the
        # extra conf entry and the mon data directory are left behind.
        _teardown_mon(ctx, manager, remote, name,
                      data_path, conf_path)
# run_daemon() in ceph.py starts a herd of daemons of the same type, but
# _run_daemon() starts only one instance.
@contextlib.contextmanager
def _run_daemon(ctx, remote, cluster, type_, id_):
    """Context manager that starts one ``ceph-<type_>`` daemon on ``remote``
    and stops it on exit.

    The daemon is registered through ``ctx.daemons.add_daemon`` with
    ``wait=False`` and the resulting daemon object is yielded to the caller.

    :param ctx: teuthology context providing ``ctx.daemons``
    :param remote: host the daemon is started on
    :param cluster: cluster name passed as ``--cluster``
    :param type_: daemon type, e.g. 'mon'
    :param id_: daemon id passed as ``-i``
    """
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    daemon_signal = 'kill'
    run_cmd = [
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'daemon-helper',
        daemon_signal,
    ]
    run_cmd_tail = [
        'ceph-%s' % (type_),
        '-f',
        '--cluster', cluster,
        '-i', id_]
    run_cmd.extend(run_cmd_tail)
    ctx.daemons.add_daemon(remote, type_, id_,
                           cluster=cluster,
                           args=run_cmd,
                           logger=log.getChild(type_),
                           stdin=run.PIPE,
                           wait=False)
    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
    try:
        yield daemon
    finally:
        # Stop the daemon even when the with-body raises, so a failed test
        # does not leave it running.
        daemon.stop()
@contextlib.contextmanager
def task(ctx, config):
    """
    Replace a monitor with a newly added one, and then revert this change.

    How it works::

    1. add a mon with the replacer id (mon.<replacer>) and start it
    2. wait for quorum
    3. remove the victim (mon.<victim>) from the monmap
    4. wait for quorum, then wait for the victim daemon to stop
    5. <yield>
    6. wipe the victim's mon store, set it up again and revive it
    7. wait for quorum
    8. remove mon.<replacer> and wait for quorum

    Options::

    victim     the id of the mon to be removed (pick a random mon by default)
    replacer   the id of the new mon (use "${victim}_prime" if not specified)
    """
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    manager = CephManager(mon_remote, ctx=ctx,
                          logger=log.getChild('ceph_manager'))

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"
    all_overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, all_overrides.get('mon_seesaw', {}))

    # The random default is drawn unconditionally, as before, even when the
    # configuration names a victim explicitly.
    default_victim = random.choice(_get_mons(ctx))
    victim = config.get('victim', default_victim)
    replacer = config.get('replacer', '{0}_prime'.format(victim))

    remote = manager.find_remote('mon', victim)
    quorum = manager.get_mon_quorum()
    cluster = manager.cluster
    original_size = len(quorum)
    grown_size = original_size + 1

    log.info('replacing {victim} with {replacer}'.format(victim=victim,
                                                         replacer=replacer))
    with _prepare_mon(ctx, manager, remote, replacer), \
            _run_daemon(ctx, remote, cluster, 'mon', replacer):
        # replacer will join the quorum automatically
        manager.wait_for_mon_quorum_size(grown_size, 10)

        # The victim has to leave the monmap: otherwise the new joiner may
        # end up with a two-mon monmap whose other member it cannot reach,
        # and it would keep probing forever.
        log.info('removing {mon}'.format(mon=victim))
        manager.raw_cluster_cmd('mon', 'remove', victim)
        manager.wait_for_mon_quorum_size(original_size, 10)

        # Once removed from the monmap the victim shuts itself down; wait
        # until it has stopped.
        victim_daemon = ctx.daemons.get_daemon('mon', victim, cluster)
        victim_daemon.wait(10)

        try:
            # perform other tasks
            yield
        finally:
            # Bring the victim back online.  Its mon store must be wiped
            # first, otherwise it refuses to boot with:
            #
            #   not in monmap and have been in a quorum before; must have
            #   been removed
            log.info('re-adding {mon}'.format(mon=victim))
            victim_data = '/var/lib/ceph/mon/{cluster}-{id}'.format(
                cluster=cluster, id=victim)
            remote.run(args=['sudo', 'rm', '-rf', victim_data])
            victim_name = 'mon.{0}'.format(victim)
            _setup_mon(ctx, manager, remote, victim, victim_name,
                       victim_data, None)

            log.info('reviving {mon}'.format(mon=victim))
            manager.revive_mon(victim)
            manager.wait_for_mon_quorum_size(grown_size, 10)

            manager.raw_cluster_cmd('mon', 'remove', replacer)
            manager.wait_for_mon_quorum_size(original_size, 10)