2014-05-13 17:45:09 +00:00
|
|
|
"""
|
|
|
|
Test pool repairing after objects are damaged.
|
|
|
|
"""
|
2013-03-27 19:11:04 +00:00
|
|
|
import logging
|
2013-04-24 22:50:28 +00:00
|
|
|
import time
|
2014-11-21 15:13:26 +00:00
|
|
|
import contextlib
|
2013-03-27 19:11:04 +00:00
|
|
|
|
|
|
|
import ceph_manager
|
|
|
|
from teuthology import misc as teuthology
|
|
|
|
|
|
|
|
# Module-level logger named after this module; every helper in this file
# reports its progress through it.
log = logging.getLogger(__name__)
|
|
|
|
|
2014-11-21 15:13:26 +00:00
|
|
|
def choose_primary(ctx, pool, num):
    """
    Select the primary of a pg as the osd to run a test against.

    :param ctx: test context; ``ctx.manager`` is asked for the primary
    :param pool: pool name, forwarded to the manager
    :param num: pg number within the pool, forwarded to the manager
    :returns: the result of ``ctx.manager.get_pg_primary(pool, num)``
    """
    log.info("Choosing primary")
    primary = ctx.manager.get_pg_primary(pool, num)
    return primary
|
2013-03-27 19:11:04 +00:00
|
|
|
|
2014-11-21 15:13:26 +00:00
|
|
|
def choose_replica(ctx, pool, num):
    """
    Select a replica of a pg as the osd to run a test against.

    :param ctx: test context; ``ctx.manager`` is asked for the replica
    :param pool: pool name, forwarded to the manager
    :param num: pg number within the pool, forwarded to the manager
    :returns: the result of ``ctx.manager.get_pg_replica(pool, num)``
    """
    log.info("Choosing replica")
    replica = ctx.manager.get_pg_replica(pool, num)
    return replica
|
2013-03-27 19:11:04 +00:00
|
|
|
|
2014-11-21 15:13:26 +00:00
|
|
|
def trunc(ctx, osd, pool, obj):
    """
    Truncate an object on one osd.

    Issues ``truncobj <pool> <obj> 1`` over that osd's admin socket.

    :param ctx: test context providing ``ctx.manager``
    :param osd: osd whose admin socket receives the command
    :param pool: pool holding the object
    :param obj: name of the object to truncate
    :returns: the result of ``ctx.manager.osd_admin_socket``
    """
    log.info("truncating object")
    # NOTE(review): the trailing '1' is presumably the new object length --
    # confirm against the osd's truncobj admin-socket command.
    command = ['truncobj', pool, obj, '1']
    return ctx.manager.osd_admin_socket(osd, command)
|
2013-03-27 19:11:04 +00:00
|
|
|
|
2014-11-21 15:13:26 +00:00
|
|
|
def dataerr(ctx, osd, pool, obj):
    """
    Inject a data error for an object on one osd.

    Issues ``injectdataerr <pool> <obj>`` over that osd's admin socket.

    :param ctx: test context providing ``ctx.manager``
    :param osd: osd whose admin socket receives the command
    :param pool: pool holding the object
    :param obj: name of the object to damage
    :returns: the result of ``ctx.manager.osd_admin_socket``
    """
    log.info("injecting data err on object")
    command = ['injectdataerr', pool, obj]
    return ctx.manager.osd_admin_socket(osd, command)
|
|
|
|
|
|
|
|
def mdataerr(ctx, osd, pool, obj):
    """
    Inject a metadata error for an object on one osd.

    Issues ``injectmdataerr <pool> <obj>`` over that osd's admin socket.

    :param ctx: test context providing ``ctx.manager``
    :param osd: osd whose admin socket receives the command
    :param pool: pool holding the object
    :param obj: name of the object to damage
    :returns: the result of ``ctx.manager.osd_admin_socket``
    """
    log.info("injecting mdata err on object")
    command = ['injectmdataerr', pool, obj]
    return ctx.manager.osd_admin_socket(osd, command)
|
|
|
|
|
|
|
|
def omaperr(ctx, osd, pool, obj):
    """
    Cause an omap error.

    Issues ``setomapval <pool> <obj> badkey badval`` over the admin socket
    of a single osd, so the omap key is written on that osd only.

    :param ctx: test context providing ``ctx.manager``
    :param osd: osd whose admin socket receives the command
    :param pool: pool holding the object
    :param obj: name of the object whose omap is modified
    :returns: the result of ``ctx.manager.osd_admin_socket``
    """
    log.info("injecting omap err on object")
    return ctx.manager.osd_admin_socket(
        osd,
        ['setomapval', pool, obj, 'badkey', 'badval'])
|
2014-05-13 17:45:09 +00:00
|
|
|
|
2014-11-21 15:13:26 +00:00
|
|
|
def repair_test_1(ctx, corrupter, chooser, scrub_type):
    """
    Run one corrupt / detect / repair cycle on a single object.

    Inside a ``ctx.manager.pool("repair_pool_1", 1)`` context an object is
    written, damaged on the chosen osd and scrubbed; pg 0 must then be
    reported inconsistent.  The pg is repaired and scrubbed again, after
    which it must be reported consistent.

    :param ctx: test context providing ``ctx.manager``
    :param corrupter: error generating function (truncate, data-error, or
                      meta-data error, for example), called as
                      ``corrupter(ctx, osd, pool, obj)``
    :param chooser: osd chooser (primary or replica), called as
                    ``chooser(ctx, pool, pg)``
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_1"
    obj_name = 'repair_test_obj'
    pg = 0
    mgr = ctx.manager

    mgr.wait_for_clean()
    with mgr.pool(pool, 1):
        log.info("starting repair test type 1")
        target_osd = chooser(ctx, pool, pg)

        # Write the object that is about to be damaged.
        log.info("doing put")
        mgr.do_put(pool, obj_name, '/etc/hosts')

        # Damage it on the chosen osd.
        log.info("corrupting object")
        corrupter(ctx, target_osd, pool, obj_name)

        # The scrub has to notice the damage.
        log.info("scrubbing")
        mgr.do_pg_scrub(pool, pg, scrub_type)
        assert mgr.pg_inconsistent(pool, pg)

        # Repair, then scrub once more.
        log.info("repairing")
        mgr.do_pg_scrub(pool, pg, "repair")
        log.info("re-scrubbing")
        mgr.do_pg_scrub(pool, pg, scrub_type)

        # The pg must be consistent again.
        assert not mgr.pg_inconsistent(pool, pg)
        log.info("done")
|
|
|
|
|
2014-11-21 15:13:26 +00:00
|
|
|
def repair_test_2(ctx, config, chooser):
    """
    Multi-object repair test with several kinds of damage.

    First creates a set of objects and sets the omap value on some of
    them.  It then corrupts an object, does both a deep-scrub and a scrub,
    and then corrupts more objects.  After that, it repairs the pool and
    makes sure that the pool is consistent, both some time after the
    repair and again after a final deep-scrub.

    :param ctx: test context providing ``ctx.manager`` and ``ctx.cluster``
    :param config: task configuration, passed to
                   ``teuthology.get_first_mon``
    :param chooser: primary or replica selection routine, called as
                    ``chooser(ctx, pool, pg)``
    """
    pool = "repair_pool_2"
    ctx.manager.wait_for_clean()
    with ctx.manager.pool(pool, 1):
        log.info("starting repair test type 2")
        victim_osd = chooser(ctx, pool, 0)
        first_mon = teuthology.get_first_mon(ctx, config)
        # keys() rather than the Python-2-only iterkeys(): unpacking the
        # single remote works the same on Python 2 and Python 3.
        (mon,) = ctx.cluster.only(first_mon).remotes.keys()

        # create objects; file1 and file5 additionally get an omap value
        # right after they are written
        log.info("doing put and setomapval")
        objects_with_omap = ('file1', 'file5')
        for name in ('file1', 'file2', 'file3', 'file4', 'file5', 'file6'):
            ctx.manager.do_put(pool, name, '/etc/hosts')
            if name in objects_with_omap:
                ctx.manager.do_rados(
                    mon, ['-p', pool, 'setomapval', name, 'key', 'val'])

        # corrupt object
        log.info("corrupting object")
        omaperr(ctx, victim_osd, pool, 'file1')

        # verify inconsistent
        log.info("scrubbing")
        ctx.manager.do_pg_scrub(pool, 0, 'deep-scrub')

        assert ctx.manager.pg_inconsistent(pool, 0)

        # Regression test for bug #4778, should still
        # be inconsistent after scrub
        ctx.manager.do_pg_scrub(pool, 0, 'scrub')

        assert ctx.manager.pg_inconsistent(pool, 0)

        # Additional corruptions including 2 types for file1
        log.info("corrupting more objects")
        dataerr(ctx, victim_osd, pool, 'file1')
        mdataerr(ctx, victim_osd, pool, 'file2')
        trunc(ctx, victim_osd, pool, 'file3')
        omaperr(ctx, victim_osd, pool, 'file6')

        # see still inconsistent
        log.info("scrubbing")
        ctx.manager.do_pg_scrub(pool, 0, 'deep-scrub')

        assert ctx.manager.pg_inconsistent(pool, 0)

        # repair
        log.info("repairing")
        ctx.manager.do_pg_scrub(pool, 0, "repair")

        # Let repair clear inconsistent flag
        time.sleep(10)

        # verify consistent
        assert not ctx.manager.pg_inconsistent(pool, 0)

        # In the future repair might determine state of
        # inconsistency itself, verify with a deep-scrub
        log.info("scrubbing")
        ctx.manager.do_pg_scrub(pool, 0, 'deep-scrub')

        # verify consistent
        assert not ctx.manager.pg_inconsistent(pool, 0)

        log.info("done")
|
2014-11-21 15:13:26 +00:00
|
|
|
|
2013-04-24 22:50:28 +00:00
|
|
|
|
2013-03-27 19:11:04 +00:00
|
|
|
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist: ['candidate had a read error', 'deep-scrub 0 missing, 1 inconsistent objects', 'deep-scrub 0 missing, 4 inconsistent objects', 'deep-scrub 1 errors', 'deep-scrub 4 errors', '!= known omap_digest', 'repair 0 missing, 1 inconsistent objects', 'repair 0 missing, 4 inconsistent objects', 'repair 1 errors, 1 fixed', 'repair 4 errors, 4 fixed', 'scrub 0 missing, 1 inconsistent', 'scrub 1 errors', 'size 1 != known size']
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    :param ctx: test context; a ``CephManager`` is attached to it as
                ``ctx.manager`` if it does not have one yet
    :param config: task configuration; ``None`` is treated as ``{}``,
                   anything else must be a dict
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    if not hasattr(ctx, 'manager'):
        first_mon = teuthology.get_first_mon(ctx, config)
        # keys() rather than the Python-2-only iterkeys(): unpacking the
        # single remote works the same on Python 2 and Python 3.
        (mon,) = ctx.cluster.only(first_mon).remotes.keys()
        ctx.manager = ceph_manager.CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager')
            )

    ctx.manager.wait_for_all_up()

    # Single-object scenarios, run in this order:
    # (corruption, victim chooser, scrub type expected to detect it).
    single_object_cases = (
        (mdataerr, choose_primary, "scrub"),
        (mdataerr, choose_replica, "scrub"),
        (dataerr, choose_primary, "deep-scrub"),
        (dataerr, choose_replica, "deep-scrub"),
        (trunc, choose_primary, "scrub"),
        (trunc, choose_replica, "scrub"),
    )
    for corrupter, chooser, scrub_type in single_object_cases:
        repair_test_1(ctx, corrupter, chooser, scrub_type)

    # Multi-object scenario, against the primary and then a replica.
    repair_test_2(ctx, config, choose_primary)
    repair_test_2(ctx, config, choose_replica)
|