ceph_manager: Implement export/import when thrasher kills an osd

Use list-pgs so the test operates on the pgs actually present on the osd, avoiding races.

Signed-off-by: David Zafman <david.zafman@inktank.com>
David Zafman 2014-08-04 13:07:19 -07:00 committed by David Zafman
parent 9ade22dd34
commit 0cdf6e813d
2 changed files with 46 additions and 0 deletions


@@ -7,6 +7,7 @@ import time
 import gevent
 import json
 import threading
+import os
 from teuthology import misc as teuthology
 from tasks.scrub import Scrubber
 from util.rados import cmd_erasure_code_profile
@@ -75,6 +76,7 @@ class Thrasher:
             self.revive_timeout += 120
         self.clean_wait = self.config.get('clean_wait', 0)
         self.minin = self.config.get("min_in", 3)
+        self.ceph_objectstore_tool = self.config.get('ceph_objectstore_tool', True)
         num_osds = self.in_osds + self.out_osds
         self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
@@ -115,6 +117,42 @@ class Thrasher:
             self.ceph_manager.mark_down_osd(osd)
         if mark_out and osd in self.in_osds:
             self.out_osd(osd)
+        if self.ceph_objectstore_tool:
+            self.log("Testing ceph_objectstore_tool on down osd")
+            (remote,) = self.ceph_manager.ctx.\
+                cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
+            FSPATH = self.ceph_manager.get_filepath()
+            JPATH = os.path.join(FSPATH, "journal")
+            prefix = "sudo ceph_objectstore_tool --data-path {fpath} --journal-path {jpath} ".format(fpath=FSPATH, jpath=JPATH)
+            cmd = (prefix + "--op list-pgs").format(id=osd)
+            proc = remote.run(args=cmd, wait=True, check_status=False, stdout=StringIO())
+            if proc.exitstatus != 0:
+                self.log("Failed to get pg list for osd.{osd}".format(osd=osd))
+                return
+            pgs = proc.stdout.getvalue().split('\n')[:-1]
+            if len(pgs) == 0:
+                self.log("No PGs found for osd.{osd}".format(osd=osd))
+                return
+            pg = random.choice(pgs)
+            fpath = os.path.join(teuthology.get_testdir(self.ceph_manager.ctx), "data", "exp.{pg}.{id}".format(pg=pg, id=osd))
+            # export
+            success = False
+            cmd = (prefix + "--op export --pgid {pg} --file {file}").format(id=osd, pg=pg, file=fpath)
+            proc = remote.run(args=cmd, check_status=False)
+            if proc.exitstatus == 0:
+                # remove
+                cmd = (prefix + "--op remove --pgid {pg}").format(id=osd, pg=pg)
+                proc = remote.run(args=cmd, check_status=False)
+                if proc.exitstatus == 0:
+                    # import
+                    cmd = (prefix + "--op import --file {file}").format(id=osd, file=fpath)
+                    proc = remote.run(args=cmd, check_status=False)
+                    if proc.exitstatus == 0:
+                        success = True
+            cmd = "rm -f {file}".format(file=fpath)
+            remote.run(args=cmd)
+            if not success:
+                raise Exception("ceph_objectstore_tool: failure with status {ret}".format(ret=proc.exitstatus))
 
     def blackhole_kill_osd(self, osd=None):
         """
@@ -1487,3 +1525,9 @@ class CephManager:
         out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
         j = json.loads(' '.join(out.splitlines()[1:]))
         return j
+
+    def get_filepath(self):
+        """
+        Return the path to osd data, with {id} left for the caller to substitute.
+        """
+        return "/var/lib/ceph/osd/ceph-{id}"


@@ -94,6 +94,8 @@ def task(ctx, config):
     chance_test_map_discontinuity: (0) chance to test map discontinuity
     map_discontinuity_sleep_time: (40) time to wait for map trims
+    ceph_objectstore_tool: (true) whether to export/import a pg while an osd
+                           is down
 
     example:
 
     tasks:
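
For reference, a minimal thrashosds stanza setting the new option explicitly might look like the following sketch; the option defaults to true, so listing it is only needed to disable the behavior with false.

tasks:
- ceph:
- thrashosds:
    ceph_objectstore_tool: true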