ceph_manager: Implement export/import when thrasher kills an osd

Use list-pgs to avoid races by seeing actual pgs present

Signed-off-by: David Zafman <david.zafman@inktank.com>
David Zafman 2014-08-04 13:07:19 -07:00 committed by David Zafman
parent 9ade22dd34
commit 0cdf6e813d
2 changed files with 46 additions and 0 deletions
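For orientation, the cycle the new code exercises on a just-killed osd looks roughly like the self-contained sketch below (Python 2, to match the codebase). The function name, the /tmp dump path, and the use of subprocess directly on the osd host are illustrative only; the commit drives the same ceph_objectstore_tool operations through a teuthology remote instead.

import random
import subprocess


def objectstore_roundtrip(osd_id, dump_dir="/tmp"):
    """Export a random pg from a stopped osd, remove it, then import it back."""
    data_path = "/var/lib/ceph/osd/ceph-{id}".format(id=osd_id)
    prefix = ["sudo", "ceph_objectstore_tool",
              "--data-path", data_path,
              "--journal-path", data_path + "/journal"]
    # list-pgs reports the pgs actually present in this osd's object store,
    # so the pg chosen below is guaranteed to exist locally (no race against
    # cluster-wide pg mappings).
    pgs = subprocess.check_output(prefix + ["--op", "list-pgs"]).split()
    if not pgs:
        return
    pg = random.choice(pgs)
    dump_file = "{dir}/exp.{pg}.{id}".format(dir=dump_dir, pg=pg, id=osd_id)
    try:
        subprocess.check_call(prefix + ["--op", "export",
                                        "--pgid", pg, "--file", dump_file])
        subprocess.check_call(prefix + ["--op", "remove", "--pgid", pg])
        subprocess.check_call(prefix + ["--op", "import", "--file", dump_file])
    finally:
        # the dump file was written by the sudo'd tool, so remove it as root
        subprocess.check_call(["sudo", "rm", "-f", dump_file])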

View File

@@ -7,6 +7,7 @@ import time
 import gevent
 import json
 import threading
+import os
 from teuthology import misc as teuthology
 from tasks.scrub import Scrubber
 from util.rados import cmd_erasure_code_profile
@@ -75,6 +76,7 @@ class Thrasher:
         self.revive_timeout += 120
         self.clean_wait = self.config.get('clean_wait', 0)
         self.minin = self.config.get("min_in", 3)
+        self.ceph_objectstore_tool = self.config.get('ceph_objectstore_tool', True)
         num_osds = self.in_osds + self.out_osds
         self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
@@ -115,6 +117,42 @@ class Thrasher:
             self.ceph_manager.mark_down_osd(osd)
         if mark_out and osd in self.in_osds:
             self.out_osd(osd)
+        if self.ceph_objectstore_tool:
+            self.log("Testing ceph_objectstore_tool on down osd")
+            (remote,) = self.ceph_manager.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
+            FSPATH = self.ceph_manager.get_filepath()
+            JPATH = os.path.join(FSPATH, "journal")
+            prefix = "sudo ceph_objectstore_tool --data-path {fpath} --journal-path {jpath} ".format(fpath=FSPATH, jpath=JPATH)
+            cmd = (prefix + "--op list-pgs").format(id=osd)
+            proc = remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO())
+            if proc.exitstatus != 0:
+                self.log("Failed to get pg list for osd.{osd}".format(osd=osd))
+                return
+            pgs = proc.stdout.getvalue().split('\n')[:-1]
+            if len(pgs) == 0:
+                self.log("No PGs found for osd.{osd}".format(osd=osd))
+                return
+            pg = random.choice(pgs)
+            fpath = os.path.join(os.path.join(teuthology.get_testdir(self.ceph_manager.ctx), "data"), "exp.{pg}.{id}".format(pg=pg, id=osd))
+            # export
+            success = False
+            cmd = (prefix + "--op export --pgid {pg} --file {file}").format(id=osd, pg=pg, file=fpath)
+            proc = remote.run(args=cmd)
+            if proc.exitstatus == 0:
+                # remove
+                cmd = (prefix + "--op remove --pgid {pg}").format(id=osd, pg=pg)
+                proc = remote.run(args=cmd)
+                if proc.exitstatus == 0:
+                    # import
+                    cmd = (prefix + "--op import --file {file}").format(id=osd, file=fpath)
+                    proc = remote.run(args=cmd)
+                    if proc.exitstatus == 0:
+                        success = True
+            cmd = "rm -f {file}".format(file=fpath)
+            remote.run(args=cmd)
+            if not success:
+                raise Exception("ceph_objectstore_tool: failure with status {ret}".format(ret=proc.exitstatus))
 
     def blackhole_kill_osd(self, osd=None):
         """
@@ -1487,3 +1525,9 @@ class CephManager:
         out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
         j = json.loads(' '.join(out.splitlines()[1:]))
         return j
+
+    def get_filepath(self):
+        """
+        Return path to osd data with {id} needing to be replaced
+        """
+        return "/var/lib/ceph/osd/ceph-{id}"

View File

@@ -94,6 +94,8 @@ def task(ctx, config):
     chance_test_map_discontinuity: (0) chance to test map discontinuity
     map_discontinuity_sleep_time: (40) time to wait for map trims
 
+    ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
+
     example:
 
     tasks:
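The new ceph_objectstore_tool option defaults to true, so existing jobs get the export/import exercise automatically; a job that wants to opt out only needs to set it to false in its thrashosds config. A hedged sketch of that plumbing in plain Python (the dict below stands in for what a job's YAML would hand to the task):

# The dict stands in for the thrashosds task config supplied by a job.
config = {
    'chance_down': 0.5,
    'ceph_objectstore_tool': False,   # skip the down-osd export/import test
}
# Thrasher.__init__ reads it with a default of True when the key is absent:
ceph_objectstore_tool = config.get('ceph_objectstore_tool', True)
assert ceph_objectstore_tool is False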