Mirror of https://github.com/ceph/ceph
ceph_manager: Add test code to use export/import to move a pg
Check for more than 1 osd down and randomize on chance_move_pg (default 100%).
For now, only export from an older down osd to the newly down osd, to avoid a missing map.

Signed-off-by: David Zafman <david.zafman@inktank.com>
parent 0cdf6e813d
commit 05eee9fa79
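Note: in rough terms the new code picks an export source and an import target among the down osds. The pg is exported from an older down osd and imported into the most recently killed one, and the whole move is gated by chance_move_pg. A minimal sketch of that selection logic, using a made-up dead_osds list rather than the committed code:

import random

# Hypothetical state: osds killed earlier come first, the newest kill is last.
dead_osds = [3, 7, 12]
chance_move_pg = 1.0      # default from this commit: always move when possible

osd = dead_osds[-1]       # the osd that was just killed
exp_osd = imp_osd = osd   # by default export and import happen on the same osd
if len(dead_osds) > 1 and random.random() < chance_move_pg:
    # export from an older down osd, import into the newly down one
    exp_osd = random.choice(dead_osds[:-1])

print("export from osd.%d, import into osd.%d" % (exp_osd, imp_osd))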
@@ -11,6 +11,7 @@ import os
 from teuthology import misc as teuthology
 from tasks.scrub import Scrubber
 from util.rados import cmd_erasure_code_profile
+from teuthology.orchestra.remote import Remote
 
 def make_admin_daemon_dir(ctx, remote):
     """
@@ -77,6 +78,7 @@ class Thrasher:
         self.clean_wait = self.config.get('clean_wait', 0)
         self.minin = self.config.get("min_in", 3)
         self.ceph_objectstore_tool = self.config.get('ceph_objectstore_tool', True)
+        self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
 
         num_osds = self.in_osds + self.out_osds
         self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
@@ -122,36 +124,62 @@ class Thrasher:
             (remote,) = self.ceph_manager.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
             FSPATH = self.ceph_manager.get_filepath()
             JPATH = os.path.join(FSPATH, "journal")
+            exp_osd = imp_osd = osd
+            exp_remote = imp_remote = remote
+            # If an older osd is available we'll move a pg from there
+            if len(self.dead_osds) > 1 and random.random() < self.chance_move_pg:
+                exp_osd = random.choice(self.dead_osds[:-1])
+                (exp_remote,) = self.ceph_manager.ctx.cluster.only('osd.{o}'.format(o=exp_osd)).remotes.iterkeys()
             prefix = "sudo ceph_objectstore_tool --data-path {fpath} --journal-path {jpath} ".format(fpath=FSPATH, jpath=JPATH)
-            cmd = (prefix + "--op list-pgs").format(id=osd)
-            proc = remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO())
-            if proc.exitstatus != 0:
-                self.log("Failed to get pg list for osd.{osd}".format(osd=osd))
-                return
+            cmd = (prefix + "--op list-pgs").format(id=exp_osd)
+            proc = exp_remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO())
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: exp list-pgs failure with status {ret}".format(ret=proc.exitstatus))
             pgs = proc.stdout.getvalue().split('\n')[:-1]
             if len(pgs) == 0:
-                self.log("No PGs found for osd.{osd}".format(osd=osd))
+                self.log("No PGs found for osd.{osd}".format(osd=exp_osd))
                 return
             pg = random.choice(pgs)
-            fpath = os.path.join(os.path.join(teuthology.get_testdir(self.ceph_manager.ctx), "data"), "exp.{pg}.{id}".format(pg=pg,id=osd))
+            exp_path = os.path.join(os.path.join(teuthology.get_testdir(self.ceph_manager.ctx), "data"), "exp.{pg}.{id}".format(pg=pg, id=exp_osd))
             # export
-            success = False
-            cmd = (prefix + "--op export --pgid {pg} --file {file}").format(id=osd, pg=pg, file=fpath)
-            proc = remote.run(args=cmd)
-            if proc.exitstatus == 0:
+            cmd = (prefix + "--op export --pgid {pg} --file {file}").format(id=exp_osd, pg=pg, file=exp_path)
+            proc = exp_remote.run(args=cmd)
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: export failure with status {ret}".format(ret=proc.exitstatus))
             # remove
-                cmd = (prefix + "--op remove --pgid {pg}").format(id=osd, pg=pg)
-                proc = remote.run(args=cmd)
-                if proc.exitstatus == 0:
-                    # import
-                    cmd = (prefix + "--op import --file {file}").format(id=osd, file=fpath)
-                    remote.run(args=cmd)
-                    if proc.exitstatus == 0:
-                        success = True
-            cmd = "rm -f {file}".format(file=fpath)
-            remote.run(args=cmd)
-            if not success:
-                raise Exception("ceph_objectstore_tool: failure with status {ret}".format(ret=proc.exitstatus))
+            cmd = (prefix + "--op remove --pgid {pg}").format(id=exp_osd, pg=pg)
+            proc = exp_remote.run(args=cmd)
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: remove failure with status {ret}".format(ret=proc.exitstatus))
+            # If there are at least 2 dead osds we might move the pg
+            if exp_osd != imp_osd:
+                # If pg isn't already on this osd, then we will move it there
+                cmd = (prefix + "--op list-pgs").format(id=imp_osd)
+                proc = imp_remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO())
+                if proc.exitstatus:
+                    raise Exception("ceph_objectstore_tool: imp list-pgs failure with status {ret}".format(ret=proc.exitstatus))
+                pgs = proc.stdout.getvalue().split('\n')[:-1]
+                if pg not in pgs:
+                    self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}".format(pg=pg, fosd=exp_osd, tosd=imp_osd))
+                    if imp_remote != exp_remote:
+                        # Copy export file to the other machine
+                        self.log("Transfer export file from {srem} to {trem}".format(srem=exp_remote, trem=imp_remote))
+                        tmpexport = Remote.get_file(exp_remote, exp_path)
+                        Remote.put_file(imp_remote, tmpexport, exp_path)
+                        os.remove(tmpexport)
+                else:
+                    # Can't move the pg after all
+                    imp_osd = exp_osd
+                    imp_remote = exp_remote
+            # import
+            cmd = (prefix + "--op import --file {file}").format(id=imp_osd, file=exp_path)
+            imp_remote.run(args=cmd)
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: import failure with status {ret}".format(ret=proc.exitstatus))
+            cmd = "rm -f {file}".format(file=exp_path)
+            exp_remote.run(args=cmd)
+            if imp_remote != exp_remote:
+                imp_remote.run(args=cmd)
 
 
     def blackhole_kill_osd(self, osd=None):
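Note: the prefix assembled above expands into a fixed sequence of ceph_objectstore_tool invocations (list-pgs, export, remove, import). A sketch of the resulting command lines, with invented data path, journal path, pgid and export file (the real values come from get_filepath(), the test data dir and the list-pgs output):

# Illustration only: every concrete value below is hypothetical.
fpath = "/var/lib/ceph/osd/ceph-3"
jpath = fpath + "/journal"
pg = "2.1a"
exp_path = "/tmp/exp.{pg}.{id}".format(pg=pg, id=3)

prefix = "sudo ceph_objectstore_tool --data-path {fpath} --journal-path {jpath} ".format(fpath=fpath, jpath=jpath)

for op in ["--op list-pgs",
           "--op export --pgid {pg} --file {file}".format(pg=pg, file=exp_path),
           "--op remove --pgid {pg}".format(pg=pg),
           "--op import --file {file}".format(file=exp_path)]:
    print(prefix + op)

In the diff itself, list-pgs, export and remove run on the host of the export osd, while the import runs on the host of the newly killed osd, with Remote.get_file/put_file carrying the export file across when the two hosts differ.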
@@ -95,6 +95,7 @@ def task(ctx, config):
     map_discontinuity_sleep_time: (40) time to wait for map trims
 
     ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
+    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
 
     example:
 
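Note: both options documented above are read out of the thrasher's config. A hypothetical override, written as the Python dict the task would receive rather than the usual yaml fragment:

# Hypothetical thrashosds config; the values here are examples, not defaults.
config = {
    'ceph_objectstore_tool': True,   # exercise export/import while an osd is down
    'chance_move_pg': 0.5,           # move the pg only half of the time (default 1.0)
}

chance_move_pg = config.get('chance_move_pg', 1.0)
assert 0.0 <= chance_move_pg <= 1.0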