import random
import time
import re
import gevent

from ..orchestra import run


class Thrasher(gevent.Greenlet):
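    """
    Randomly kill, revive, and mark OSDs in or out of a running
    cluster through a CephManager, to exercise recovery while the
    cluster is being used.
    """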

    def __init__(self, manager, config, logger=None):
        self.ceph_manager = manager
        self.ceph_manager.wait_till_clean()
        osd_status = self.ceph_manager.get_osd_status()
        self.in_osds = osd_status['in']
        self.live_osds = osd_status['live']
        self.out_osds = osd_status['out']
        self.dead_osds = osd_status['dead']
        self.stopping = False
        self.logger = logger
        self.config = config
        if self.logger is not None:
            self.log = lambda x: self.logger.info(x)
        else:
            def tmp(x):
                print x
            self.log = tmp
        if self.config is None:
            self.config = dict()
        gevent.Greenlet.__init__(self, self.do_thrash)
        self.start()

    def kill_osd(self, osd=None):
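        """
        Stop a live osd (a random one if osd is None) and update the
        local live/dead and in/out bookkeeping.
        """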
        if osd is None:
            osd = random.choice(self.live_osds)
        self.log("Killing osd %s, live_osds are %s" % (str(osd), str(self.live_osds)))
        self.live_osds.remove(osd)
        self.dead_osds.append(osd)
        if osd in self.in_osds:
            self.in_osds.remove(osd)
            self.out_osds.append(osd)
        self.ceph_manager.kill_osd(osd)

    def revive_osd(self, osd=None):
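        """
        Restart a dead osd (a random one if osd is None) and update the
        local bookkeeping accordingly.
        """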
        if osd is None:
            osd = random.choice(self.dead_osds)
        self.log("Reviving osd %s" % (str(osd),))
        self.live_osds.append(osd)
        self.dead_osds.remove(osd)
        if osd in self.out_osds:
            self.in_osds.append(osd)
            self.out_osds.remove(osd)
        self.ceph_manager.revive_osd(osd)

    def out_osd(self, osd=None):
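        """
        Mark an osd (a random in osd if osd is None) out of the cluster.
        """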
        if osd is None:
            osd = random.choice(self.in_osds)
        self.log("Removing osd %s, in_osds are: %s" % (str(osd), str(self.in_osds)))
        self.ceph_manager.mark_out_osd(osd)
        self.in_osds.remove(osd)
        self.out_osds.append(osd)

    def in_osd(self, osd=None):
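        """
        Mark an osd (a random out osd if osd is None) back into the
        cluster, reviving it first if it is currently dead.
        """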
        if osd is None:
            osd = random.choice(self.out_osds)
        if osd in self.dead_osds:
            return self.revive_osd(osd)
        self.log("Adding osd %s" % (str(osd),))
        self.out_osds.remove(osd)
        self.in_osds.append(osd)
        self.ceph_manager.mark_in_osd(osd)

    def all_up(self):
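        """
        Revive all dead osds and mark all out osds back in.
        """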
        while len(self.dead_osds) > 0:
            self.revive_osd()
        while len(self.out_osds) > 0:
            self.in_osd()

    def do_join(self):
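        """
        Ask the thrashing loop to stop and wait for it to finish.
        """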
        self.stopping = True
        self.get()

    def choose_action(self):
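        """
        Pick the next thrashing operation at random from whichever of
        out/in/kill/revive are currently allowed by the min_* limits,
        weighting the kill action by chance_down.
        """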
        chance_down = self.config.get("chance_down", 0)
        if isinstance(chance_down, int):
            chance_down = float(chance_down) / 100
        minin = self.config.get("min_in", 2)
        minout = self.config.get("min_out", 0)
        minlive = self.config.get("min_live", 2)
        mindead = self.config.get("min_dead", 2)

        actions = []
        if len(self.in_osds) > minin:
            actions.append((self.out_osd, 1.0,))
        if len(self.live_osds) > minlive and chance_down > 0:
            actions.append((self.kill_osd, chance_down))
        if len(self.out_osds) > minout:
            actions.append((self.in_osd, 1.0,))
        if len(self.dead_osds) > mindead:
            actions.append((self.revive_osd, 1.0))

        # Build a reversed, normalized cumulative distribution over the
        # candidate actions and sample one action from it.
        total = sum([y for (x, y) in actions])
        rev_cum = reduce(lambda l, (y1, y2): l + [(y1, (l[-1][1] + y2) / total)], actions, [(0, 0)])[1:]
        rev_cum.reverse()
        final_rev_cum = [(x, y - rev_cum[-1][1]) for (x, y) in rev_cum]
        val = random.uniform(0, 1)
        for (action, prob) in final_rev_cum:
            if prob < val:
                return action
        return None

    def do_thrash(self):
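        """
        Main loop: log the current osd sets, occasionally wait for the
        cluster to go clean, then perform one randomly chosen operation
        per iteration until do_join() is called, and finally bring all
        osds back up and in.
        """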
        cleanint = self.config.get("clean_interval", 60)
        delay = self.config.get("op_delay", 5)
        self.log("starting do_thrash")
        while not self.stopping:
            self.log(" ".join([str(x) for x in ["in_osds: ", self.in_osds, " out_osds: ", self.out_osds,
                                                "dead_osds: ", self.dead_osds, "live_osds: ",
                                                self.live_osds]]))
            if random.uniform(0, 1) < (float(delay) / cleanint):
                self.ceph_manager.wait_till_clean()
            self.choose_action()()
            time.sleep(delay)
        self.all_up()


class CephManager:
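    """
    Wrapper around a remote controller connection for driving a test
    Ceph cluster: running ceph CLI commands, parsing osd and pg state,
    and stopping or restarting osd daemons.
    """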

    def __init__(self, controller, ctx=None, logger=None):
        self.ctx = ctx
        self.controller = controller
        if logger:
            self.log = lambda x: logger.info(x)
        else:
            def tmp(x):
                print x
            self.log = tmp

    def raw_cluster_cmd(self, suffix):
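        """
        Run the ceph CLI on the controller with the given argument
        string appended and return its stdout as a string.
        """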
        proc = self.controller.run(
            args=[
                "/bin/sh", "-c",
                " ".join([
                    "LD_LIBRARY_PATH=/tmp/cephtest/binary/usr/local/lib",
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    '/tmp/cephtest/archive/coverage',
                    "/tmp/cephtest/binary/usr/local/bin/ceph -k /tmp/cephtest/ceph.keyring -c " +
                    "/tmp/cephtest/ceph.conf " + suffix
                    ])
                ],
            stdout=run.PIPE,
            wait=False
            )

        out = ""
        tmp = proc.stdout.read(1)
        while tmp:
            out += tmp
            tmp = proc.stdout.read(1)
        return out

    def raw_cluster_status(self):
        return self.raw_cluster_cmd("-s")

    def raw_osd_status(self):
        return self.raw_cluster_cmd("osd dump -o -")

    def raw_pg_status(self):
        return self.raw_cluster_cmd("pg dump -o -")

    def get_osd_status(self):
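        """
        Parse 'osd dump' output and daemon state into lists of osd ids,
        returned as a dict with keys 'in', 'out', 'up', 'down', 'dead',
        'live', and 'raw'.
        """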
        osd_lines = filter(
            lambda x: x[:3] == 'osd' and (("up" in x) or ("down" in x)),
            self.raw_osd_status().split('\n'))
        self.log(osd_lines)
        in_osds = [int(i[3:].split()[0]) for i in filter(
                lambda x: " in " in x, osd_lines)]
        out_osds = [int(i[3:].split()[0]) for i in filter(
                lambda x: " out " in x, osd_lines)]
        up_osds = [int(i[3:].split()[0]) for i in filter(
                lambda x: " up " in x, osd_lines)]
        down_osds = [int(i[3:].split()[0]) for i in filter(
                lambda x: " down " in x, osd_lines)]
        dead_osds = [int(x.id_) for x in
                     filter(lambda x: not x.running(),
                            self.ctx.daemons.iter_daemons_of_role('osd'))]
        live_osds = [int(x.id_) for x in
                     filter(lambda x: x.running(),
                            self.ctx.daemons.iter_daemons_of_role('osd'))]
        return {'in': in_osds, 'out': out_osds, 'up': up_osds,
                'down': down_osds, 'dead': dead_osds, 'live': live_osds,
                'raw': osd_lines}

    def get_num_pgs(self):
        status = self.raw_cluster_status()
        return int(re.search(
                r"\d* pgs:",
                status).group(0).split()[0])

    def get_num_active_clean(self):
        status = self.raw_cluster_status()
        self.log(status)
        match = re.search(
            r"\d* active.clean",
            status)
        if match is None:
            return 0
        else:
            return int(match.group(0).split()[0])

    def is_clean(self):
        return self.get_num_active_clean() == self.get_num_pgs()

    def wait_till_clean(self, timeout=None):
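        """
        Block until every pg is active+clean, asserting if the optional
        timeout (in seconds) expires first.
        """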
        self.log("waiting till clean")
        start = time.time()
        while not self.is_clean():
            if timeout is not None:
                assert time.time() - start < timeout, \
                    'failed to become clean before timeout expired'
            time.sleep(3)
        self.log("clean!")

    def mark_out_osd(self, osd):
        self.raw_cluster_cmd("osd out %s" % (str(osd),))

    def kill_osd(self, osd):
        self.ctx.daemons.get_daemon('osd', osd).stop()

    def revive_osd(self, osd):
        self.ctx.daemons.get_daemon('osd', osd).restart()

    def mark_down_osd(self, osd):
        self.raw_cluster_cmd("osd down %s" % (str(osd),))

    def mark_in_osd(self, osd):
        self.raw_cluster_cmd("osd in %s" % (str(osd),))