repair_test: add test for repairing read errs and truncations

Signed-off-by: Samuel Just <sam.just@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
This commit is contained in:
Samuel Just 2013-03-27 12:11:04 -07:00
parent 2a1cdda90d
commit d81babffe5
2 changed files with 188 additions and 2 deletions

View File

@ -314,6 +314,30 @@ class CephManager:
)
return proc
def do_put(self, pool, obj, fname):
    """Write the local file fname into pool as object obj via the rados tool."""
    cmd = ['-p', pool, 'put', obj, fname]
    return self.do_rados(self.controller, cmd)
def do_get(self, pool, obj, fname='/dev/null'):
    """
    Read object obj from pool into the local file fname via the rados tool.

    Fix: this previously invoked 'stat' instead of 'get'.  'rados stat'
    does not take an output filename, so the trailing fname argument was
    bogus and the object data was never actually read back; 'get' matches
    the method name and the '/dev/null' default.
    """
    return self.do_rados(
        self.controller,
        [
            '-p',
            pool,
            'get',
            obj,
            fname
        ]
    )
def osd_admin_socket(self, osdnum, command, check_status=True):
testdir = teuthology.get_testdir(self.ctx)
remote = None
@ -339,14 +363,32 @@ class CephManager:
check_status=check_status
)
def get_pgid(self, pool, pgnum):
    """Return the '<poolnum>.<pgnum>' pg id string for pool/pgnum."""
    return "{poolnum}.{pgnum}".format(
        poolnum=self.get_pool_num(pool),
        pgnum=pgnum)
def get_pg_replica(self, pool, pgnum):
    """
    Return the osd id of a replica (last entry of the acting set)
    for the pg identified by (pool, pgnum), e.g. (data, 0) -> 0.
    """
    dump = self.raw_cluster_cmd("pg", "dump", '--format=json')
    # the first line of `pg dump` output is a status header; json follows
    parsed = json.loads('\n'.join(dump.split('\n')[1:]))
    wanted = self.get_pgid(pool, pgnum)
    for stats in parsed['pg_stats']:
        if stats['pgid'] == wanted:
            return int(stats['acting'][-1])
    assert False
def get_pg_primary(self, pool, pgnum):
    """
    Return the osd id of the primary (first entry of the acting set)
    for the pg identified by (pool, pgnum), e.g. (data, 0) -> 0.

    Fix: removed dead code — `poolnum` was computed but never used, and
    `pg_str` was assigned a hand-rolled "%d.%d" value that was immediately
    overwritten by the get_pgid() call (leftovers from the get_pgid refactor).
    """
    output = self.raw_cluster_cmd("pg", "dump", '--format=json')
    # the first line of `pg dump` output is a status header; json follows
    j = json.loads('\n'.join(output.split('\n')[1:]))
    pg_str = self.get_pgid(pool, pgnum)
    for pg in j['pg_stats']:
        if pg['pgid'] == pg_str:
            return int(pg['acting'][0])
@ -554,6 +596,32 @@ class CephManager:
ret[status] += 1
return ret
def pg_scrubbing(self, pool, pgnum):
    """Return True if the pg's reported state includes 'scrub'."""
    state = self.get_single_pg_stats(self.get_pgid(pool, pgnum))['state']
    return 'scrub' in state
def pg_repairing(self, pool, pgnum):
    """Return True if the pg's reported state includes 'repair'."""
    state = self.get_single_pg_stats(self.get_pgid(pool, pgnum))['state']
    return 'repair' in state
def pg_inconsistent(self, pool, pgnum):
    """Return True if the pg's reported state includes 'inconsistent'."""
    state = self.get_single_pg_stats(self.get_pgid(pool, pgnum))['state']
    return 'inconsistent' in state
def get_last_scrub_stamp(self, pool, pgnum):
    """Return the last_scrub_stamp value from the pg's stats."""
    pgid = self.get_pgid(pool, pgnum)
    pg_stats = self.get_single_pg_stats(pgid)
    return pg_stats["last_scrub_stamp"]
def do_pg_scrub(self, pool, pgnum, stype):
    """
    Kick off a scrub of type stype ('scrub', 'deep-scrub' or 'repair')
    on the pg and block, polling every 10s, until its scrub stamp moves.
    """
    stamp_before = self.get_last_scrub_stamp(pool, pgnum)
    self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
    while True:
        # the stamp only changes once the requested scrub has completed
        if self.get_last_scrub_stamp(pool, pgnum) != stamp_before:
            break
        self.log("waiting for scrub type %s" % (stype,))
        time.sleep(10)
def get_single_pg_stats(self, pgid):
all_stats = self.get_pg_stats()

View File

@ -0,0 +1,118 @@
import logging
import ceph_manager
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
def setup(ctx, config):
ctx.manager.wait_for_clean()
ctx.manager.create_pool("repair_test_pool", 1)
return "repair_test_pool"
def teardown(ctx, config, pool):
ctx.manager.remove_pool(pool)
ctx.manager.wait_for_clean()
def run_test(ctx, config, test):
    """
    Create the test pool, run a single repair test against it, then
    tear the pool down again.

    Fix: teardown now runs in a finally block, so a test that raises
    (e.g. a failed assert on pg state) no longer leaks the pool into
    subsequent tests.
    """
    pool = setup(ctx, config)
    try:
        test(ctx, config, pool)
    finally:
        teardown(ctx, config, pool)
def choose_primary(ctx):
    """Return a chooser callable mapping (pool, pgnum) to the primary osd id."""
    def pick(pool, num):
        log.info("Choosing primary")
        return ctx.manager.get_pg_primary(pool, num)
    return pick
def choose_replica(ctx):
    """Return a chooser callable mapping (pool, pgnum) to a replica osd id."""
    def pick(pool, num):
        log.info("Choosing replica")
        return ctx.manager.get_pg_replica(pool, num)
    return pick
def trunc(ctx):
    """Return a corrupter callable that truncates the object on the given osd."""
    def corrupt(osd, pool, obj):
        log.info("truncating object")
        cmd = ['truncobj', pool, obj, '1']
        return ctx.manager.osd_admin_socket(osd, cmd)
    return corrupt
def dataerr(ctx):
    """Return a corrupter callable that injects a data read error on the osd."""
    def corrupt(osd, pool, obj):
        log.info("injecting data err on object")
        cmd = ['injectdataerr', pool, obj]
        return ctx.manager.osd_admin_socket(osd, cmd)
    return corrupt
def mdataerr(ctx):
    """Return a corrupter callable that injects a metadata read error on the osd."""
    def corrupt(osd, pool, obj):
        log.info("injecting mdata err on object")
        cmd = ['injectmdataerr', pool, obj]
        return ctx.manager.osd_admin_socket(osd, cmd)
    return corrupt
def gen_repair_test(corrupter, chooser, scrub_type):
    """
    Build one repair test case.

    corrupter  -- callable(osd, pool, obj) that damages the object
    chooser    -- callable(pool, pgnum) that picks the victim osd
    scrub_type -- 'scrub' or 'deep-scrub'; the scrub flavor that should
                  detect the injected damage
    """
    def run(ctx, config, pool):
        mgr = ctx.manager
        log.info("starting repair test")
        victim_osd = chooser(pool, 0)

        # write an object we can damage
        log.info("doing put")
        mgr.do_put(pool, 'repair_test_obj', '/etc/hosts')

        # damage its copy on the chosen osd
        log.info("corrupting object")
        corrupter(victim_osd, pool, 'repair_test_obj')

        # the right scrub flavor should flag the pg inconsistent
        log.info("scrubbing")
        mgr.do_pg_scrub(pool, 0, scrub_type)
        assert mgr.pg_inconsistent(pool, 0)

        # repair, re-scrub, and confirm the pg is consistent again
        log.info("repairing")
        mgr.do_pg_scrub(pool, 0, "repair")
        log.info("re-scrubbing")
        mgr.do_pg_scrub(pool, 0, scrub_type)
        assert not mgr.pg_inconsistent(pool, 0)
        log.info("done")
    return run
def task(ctx, config):
    """
    Test [deep] repair in several situations:
    Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    # lazily build a cluster manager if an earlier task has not already
    if not hasattr(ctx, 'manager'):
        first_mon = teuthology.get_first_mon(ctx, config)
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        ctx.manager = ceph_manager.CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager')
        )

    # each (corrupter, scrub flavor) pair is exercised on both the
    # primary and a replica of pg 0
    cases = [
        (mdataerr(ctx), "scrub"),
        (dataerr(ctx), "deep-scrub"),
        (trunc(ctx), "scrub"),
    ]
    for corrupter, stype in cases:
        for chooser in (choose_primary(ctx), choose_replica(ctx)):
            run_test(ctx, config, gen_repair_test(corrupter, chooser, stype))