2011-10-04 00:49:13 +00:00
|
|
|
from cStringIO import StringIO
|
2011-06-13 23:36:21 +00:00
|
|
|
import random
|
|
|
|
import time
|
|
|
|
import gevent
|
2011-10-17 22:32:22 +00:00
|
|
|
import json
|
2012-12-11 22:21:48 +00:00
|
|
|
import threading
|
|
|
|
from teuthology import misc as teuthology
|
2013-01-23 02:13:19 +00:00
|
|
|
from teuthology.task import ceph as ceph_task
|
2011-06-13 23:36:21 +00:00
|
|
|
|
2012-03-30 01:07:30 +00:00
|
|
|
class Thrasher:
|
2011-08-25 22:18:42 +00:00
|
|
|
def __init__(self, manager, config, logger=None):
    """Start thrashing OSDs through the given CephManager.

    manager: CephManager used to issue cluster commands.
    config:  dict of thrashing options (may be None).
    logger:  optional logger; falls back to print when omitted.
    """
    self.ceph_manager = manager
    self.ceph_manager.wait_for_clean()
    osd_status = self.ceph_manager.get_osd_status()
    self.in_osds = osd_status['in']
    self.live_osds = osd_status['live']
    self.out_osds = osd_status['out']
    self.dead_osds = osd_status['dead']
    self.stopping = False
    self.logger = logger
    # Normalize config *before* reading options from it; previously the
    # None check came after several .get() calls, so a missing config
    # raised AttributeError instead of falling back to defaults.
    if config is None:
        config = dict()
    self.config = config
    self.revive_timeout = self.config.get("revive_timeout", 75)
    if self.config.get('powercycle'):
        # powercycled nodes take much longer to come back up
        self.revive_timeout += 120
    self.clean_wait = self.config.get('clean_wait', 0)
    self.minin = self.config.get("min_in", 2)

    # pg budget is max_pgs_per_pool_osd per OSD.  Note len(): the
    # original multiplied the int by the concatenated *list* of osd
    # ids, which produces a repeated list instead of a numeric cap.
    num_osds = len(self.in_osds + self.out_osds)
    self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds

    if self.logger is not None:
        self.log = lambda x: self.logger.info(x)
    else:
        def tmp(x):
            print(x)
        self.log = tmp

    # prevent monitor from auto-marking things out while thrasher runs
    # try both old and new tell syntax, in case we are testing old code
    try:
        manager.raw_cluster_cmd('--', 'tell', 'mon.*', 'injectargs',
                                '--mon-osd-down-out-interval 0')
    except Exception:
        manager.raw_cluster_cmd('--', 'mon', 'tell', '*', 'injectargs',
                                '--mon-osd-down-out-interval 0')
    self.thread = gevent.spawn(self.do_thrash)
|
2011-06-13 23:36:21 +00:00
|
|
|
|
2013-07-26 02:43:08 +00:00
|
|
|
def kill_osd(self, osd=None, mark_down=False, mark_out=False):
|
2011-09-14 23:31:58 +00:00
|
|
|
if osd is None:
|
|
|
|
osd = random.choice(self.live_osds)
|
2012-11-09 00:37:52 +00:00
|
|
|
self.log("Killing osd %s, live_osds are %s" % (str(osd),str(self.live_osds)))
|
2011-09-14 23:31:58 +00:00
|
|
|
self.live_osds.remove(osd)
|
|
|
|
self.dead_osds.append(osd)
|
|
|
|
self.ceph_manager.kill_osd(osd)
|
2013-07-26 02:43:08 +00:00
|
|
|
if mark_down:
|
|
|
|
self.ceph_manager.mark_down_osd(osd)
|
ceph_manager: don't mark out an out osd on kill_osd(..., mark_out=True)
Fixes
2013-07-27T02:02:23.879 INFO:teuthology.task.thrashosds.thrasher:Recovered, killing an osd
2013-07-27T02:02:23.879 INFO:teuthology.task.thrashosds.thrasher:Killing osd 2, live_osds are [3, 4, 0, 1, 5, 2]
2013-07-27T02:02:24.547 INFO:teuthology.task.thrashosds.thrasher:Removing osd 2, in_osds are: [4, 0, 1, 5]
...
ValueError: list.remove(x): x not in list
Signed-off-by: Sage Weil <sage@inktank.com>
2013-07-27 14:48:55 +00:00
|
|
|
if mark_out and osd in self.in_osds:
|
2013-07-26 02:43:08 +00:00
|
|
|
self.out_osd(osd)
|
2011-09-14 23:31:58 +00:00
|
|
|
|
2012-02-01 00:13:59 +00:00
|
|
|
def blackhole_kill_osd(self, osd=None):
|
|
|
|
if osd is None:
|
|
|
|
osd = random.choice(self.live_osds)
|
2012-11-09 00:37:52 +00:00
|
|
|
self.log("Blackholing and then killing osd %s, live_osds are %s" % (str(osd),str(self.live_osds)))
|
2012-02-01 00:13:59 +00:00
|
|
|
self.live_osds.remove(osd)
|
|
|
|
self.dead_osds.append(osd)
|
|
|
|
self.ceph_manager.blackhole_kill_osd(osd)
|
|
|
|
|
2011-09-14 23:31:58 +00:00
|
|
|
def revive_osd(self, osd=None):
|
|
|
|
if osd is None:
|
|
|
|
osd = random.choice(self.dead_osds)
|
2012-11-09 00:37:52 +00:00
|
|
|
self.log("Reviving osd %s" % (str(osd),))
|
2011-09-14 23:31:58 +00:00
|
|
|
self.live_osds.append(osd)
|
|
|
|
self.dead_osds.remove(osd)
|
2013-05-06 21:10:11 +00:00
|
|
|
self.ceph_manager.revive_osd(osd, self.revive_timeout)
|
2011-09-14 23:31:58 +00:00
|
|
|
|
|
|
|
def out_osd(self, osd=None):
|
|
|
|
if osd is None:
|
|
|
|
osd = random.choice(self.in_osds)
|
2012-11-09 00:37:52 +00:00
|
|
|
self.log("Removing osd %s, in_osds are: %s" % (str(osd),str(self.in_osds)))
|
2011-09-14 23:31:58 +00:00
|
|
|
self.ceph_manager.mark_out_osd(osd)
|
|
|
|
self.in_osds.remove(osd)
|
|
|
|
self.out_osds.append(osd)
|
|
|
|
|
|
|
|
def in_osd(self, osd=None):
|
|
|
|
if osd is None:
|
|
|
|
osd = random.choice(self.out_osds)
|
|
|
|
if osd in self.dead_osds:
|
|
|
|
return self.revive_osd(osd)
|
2012-11-09 00:37:52 +00:00
|
|
|
self.log("Adding osd %s" % (str(osd),))
|
2011-06-13 23:36:21 +00:00
|
|
|
self.out_osds.remove(osd)
|
|
|
|
self.in_osds.append(osd)
|
|
|
|
self.ceph_manager.mark_in_osd(osd)
|
2012-12-11 22:21:48 +00:00
|
|
|
self.log("Added osd %s"%(str(osd),))
|
2011-06-13 23:36:21 +00:00
|
|
|
|
|
|
|
def all_up(self):
|
2011-09-14 23:31:58 +00:00
|
|
|
while len(self.dead_osds) > 0:
|
2012-12-11 22:21:48 +00:00
|
|
|
self.log("reviving osd")
|
2011-09-14 23:31:58 +00:00
|
|
|
self.revive_osd()
|
2011-06-13 23:36:21 +00:00
|
|
|
while len(self.out_osds) > 0:
|
2012-12-11 22:21:48 +00:00
|
|
|
self.log("inning osd")
|
2011-09-14 23:31:58 +00:00
|
|
|
self.in_osd()
|
2011-06-13 23:36:21 +00:00
|
|
|
|
|
|
|
def do_join(self):
|
|
|
|
self.stopping = True
|
2012-03-30 01:07:30 +00:00
|
|
|
self.thread.get()
|
2011-06-13 23:36:21 +00:00
|
|
|
|
2012-12-11 22:21:48 +00:00
|
|
|
def grow_pool(self):
|
|
|
|
pool = self.ceph_manager.get_pool()
|
|
|
|
self.log("Growing pool %s"%(pool,))
|
|
|
|
self.ceph_manager.expand_pool(pool, self.config.get('pool_grow_by', 10), self.max_pgs)
|
|
|
|
|
|
|
|
def fix_pgp_num(self):
|
|
|
|
pool = self.ceph_manager.get_pool()
|
|
|
|
self.log("fixing pg num pool %s"%(pool,))
|
|
|
|
self.ceph_manager.set_pool_pgpnum(pool)
|
|
|
|
|
2012-11-07 20:36:37 +00:00
|
|
|
def test_pool_min_size(self):
    """Exercise pool min_size handling.

    Kill and mark out every osd except one, wait, kill the survivor
    too, then revive everything and wait for recovery.
    """
    self.log("test_pool_min_size")
    self.all_up()
    self.ceph_manager.wait_for_recovery(
        timeout=self.config.get('timeout')
        )
    the_one = random.choice(self.in_osds)
    # self.log accepts exactly one pre-formatted string; the original
    # passed the osd as a second positional argument, raising TypeError
    self.log("Killing everyone but %s" % (the_one,))
    # use a real list, not filter(): it is iterated several times below
    to_kill = [x for x in self.in_osds if x != the_one]
    [self.kill_osd(i) for i in to_kill]
    [self.out_osd(i) for i in to_kill]
    time.sleep(self.config.get("test_pool_min_size_time", 10))
    self.log("Killing %s" % (the_one,))
    self.kill_osd(the_one)
    self.out_osd(the_one)
    self.log("Reviving everyone but %s" % (the_one,))
    [self.revive_osd(i) for i in to_kill]
    [self.in_osd(i) for i in to_kill]
    self.log("Revived everyone but %s" % (the_one,))
    self.log("Waiting for clean")
    self.ceph_manager.wait_for_recovery(
        timeout=self.config.get('timeout')
        )
|
|
|
|
|
2013-01-24 00:13:22 +00:00
|
|
|
def inject_pause(self, conf_key, duration, check_after, should_be_down):
|
2013-01-26 21:13:08 +00:00
|
|
|
the_one = random.choice(self.live_osds)
|
2013-01-24 00:13:22 +00:00
|
|
|
self.log("inject_pause on {osd}".format(osd = the_one))
|
|
|
|
self.log(
|
|
|
|
"Testing {key} pause injection for duration {duration}".format(
|
|
|
|
key = conf_key,
|
|
|
|
duration = duration
|
|
|
|
))
|
|
|
|
self.log(
|
|
|
|
"Checking after {after}, should_be_down={shouldbedown}".format(
|
|
|
|
after = check_after,
|
|
|
|
shouldbedown = should_be_down
|
|
|
|
))
|
|
|
|
self.ceph_manager.set_config(the_one, **{conf_key:duration})
|
|
|
|
if not should_be_down:
|
|
|
|
return
|
|
|
|
time.sleep(check_after)
|
|
|
|
status = self.ceph_manager.get_osd_status()
|
|
|
|
assert the_one in status['down']
|
|
|
|
time.sleep(duration - check_after + 20)
|
|
|
|
status = self.ceph_manager.get_osd_status()
|
|
|
|
assert not the_one in status['down']
|
|
|
|
|
2013-03-21 21:37:38 +00:00
|
|
|
def test_backfill_full(self):
    """
    Test backfills stopping when the replica fills up.

    First, use osd_backfill_full_ratio to simulate a now full
    osd by setting it to 0 on all of the OSDs.

    Second, on a random subset, set
    osd_debug_skip_full_check_in_backfill_reservation to force
    the more complicated check in do_scan to be exercised.

    Then, verify that all backfills stop.
    """
    self.log("injecting osd_backfill_full_ratio = 0")
    for osd in self.live_osds:
        self.ceph_manager.set_config(
            osd,
            osd_debug_skip_full_check_in_backfill_reservation = random.choice(
                ['false', 'true']),
            osd_backfill_full_ratio = 0)
    # give in-flight backfills up to 30s to drain
    for _ in range(30):
        status = self.ceph_manager.compile_pg_status()
        if 'backfill' not in status:
            break
        self.log(
            "waiting for {still_going} backfills".format(
                still_going=status.get('backfill')))
        time.sleep(1)
    assert('backfill' not in self.ceph_manager.compile_pg_status().keys())
    # restore sane settings
    for osd in self.live_osds:
        self.ceph_manager.set_config(
            osd,
            osd_debug_skip_full_check_in_backfill_reservation = 'false',
            osd_backfill_full_ratio = 0.85)
|
|
|
|
|
2013-07-26 02:43:08 +00:00
|
|
|
def test_map_discontinuity(self):
    """
    1) Allows the osds to recover
    2) kills an osd
    3) allows the remaining osds to recover
    4) waits for some time
    5) revives the osd
    This sequence should cause the revived osd to have to handle
    a map gap since the mons would have trimmed
    """
    # make sure we have one osd to spare beyond the configured minimum
    while len(self.in_osds) < (self.minin + 1):
        self.in_osd()
    self.log("Waiting for recovery")
    self.ceph_manager.wait_for_all_up(
        timeout=self.config.get('timeout')
        )
    # now we wait 20s for the pg status to change, if it takes longer,
    # the test *should* fail!
    time.sleep(20)
    self.ceph_manager.wait_for_clean(
        timeout=self.config.get('timeout')
        )
    self.log("Recovered, killing an osd")
    self.kill_osd(mark_down=True, mark_out=True)
    self.log("Waiting for clean again")
    self.ceph_manager.wait_for_clean(
        timeout=self.config.get('timeout')
        )
    self.log("Waiting for trim")
    time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40)))
    self.revive_osd()
|
|
|
|
|
2011-09-14 23:31:58 +00:00
|
|
|
def choose_action(self):
|
2013-01-24 01:44:05 +00:00
|
|
|
chance_down = self.config.get('chance_down', 0.4)
|
2012-12-11 22:21:48 +00:00
|
|
|
chance_test_min_size = self.config.get('chance_test_min_size', 0)
|
2013-03-21 21:37:38 +00:00
|
|
|
chance_test_backfill_full= self.config.get('chance_test_backfill_full', 0)
|
2011-09-14 23:31:58 +00:00
|
|
|
if isinstance(chance_down, int):
|
|
|
|
chance_down = float(chance_down) / 100
|
2013-07-26 02:43:08 +00:00
|
|
|
minin = self.minin
|
2011-09-14 23:31:58 +00:00
|
|
|
minout = self.config.get("min_out", 0)
|
|
|
|
minlive = self.config.get("min_live", 2)
|
2012-01-16 22:40:34 +00:00
|
|
|
mindead = self.config.get("min_dead", 0)
|
2011-09-14 23:31:58 +00:00
|
|
|
|
2012-01-16 22:40:34 +00:00
|
|
|
self.log('choose_action: min_in %d min_out %d min_live %d min_dead %d' %
|
2012-12-11 22:21:48 +00:00
|
|
|
(minin, minout, minlive, mindead))
|
2011-09-14 23:31:58 +00:00
|
|
|
actions = []
|
|
|
|
if len(self.in_osds) > minin:
|
|
|
|
actions.append((self.out_osd, 1.0,))
|
2012-01-16 22:40:34 +00:00
|
|
|
if len(self.live_osds) > minlive and chance_down > 0:
|
|
|
|
actions.append((self.kill_osd, chance_down,))
|
2011-09-14 23:31:58 +00:00
|
|
|
if len(self.out_osds) > minout:
|
2012-02-12 22:36:11 +00:00
|
|
|
actions.append((self.in_osd, 1.7,))
|
2012-01-16 22:40:34 +00:00
|
|
|
if len(self.dead_osds) > mindead:
|
|
|
|
actions.append((self.revive_osd, 1.0,))
|
2012-12-11 22:21:48 +00:00
|
|
|
actions.append((self.grow_pool, self.config.get('chance_pgnum_grow', 0),))
|
|
|
|
actions.append((self.fix_pgp_num, self.config.get('chance_pgpnum_fix', 0),))
|
2012-11-07 20:36:37 +00:00
|
|
|
actions.append((self.test_pool_min_size, chance_test_min_size,))
|
2013-03-21 21:37:38 +00:00
|
|
|
actions.append((self.test_backfill_full, chance_test_backfill_full,))
|
2013-01-24 00:13:22 +00:00
|
|
|
for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
|
|
|
|
for scenario in [
|
|
|
|
(lambda: self.inject_pause(key,
|
|
|
|
self.config.get('pause_short', 3),
|
|
|
|
0,
|
|
|
|
False),
|
|
|
|
self.config.get('chance_inject_pause_short', 1),),
|
|
|
|
(lambda: self.inject_pause(key,
|
2013-01-24 20:50:24 +00:00
|
|
|
self.config.get('pause_long', 80),
|
|
|
|
self.config.get('pause_check_after', 70),
|
2013-01-24 00:13:22 +00:00
|
|
|
True),
|
2013-01-25 01:31:38 +00:00
|
|
|
self.config.get('chance_inject_pause_long', 0),)]:
|
2013-01-24 00:13:22 +00:00
|
|
|
actions.append(scenario)
|
2011-09-14 23:31:58 +00:00
|
|
|
|
|
|
|
total = sum([y for (x,y) in actions])
|
2012-01-16 22:43:56 +00:00
|
|
|
val = random.uniform(0, total)
|
|
|
|
for (action, prob) in actions:
|
|
|
|
if val < prob:
|
2011-09-14 23:31:58 +00:00
|
|
|
return action
|
2012-01-16 22:43:56 +00:00
|
|
|
val -= prob
|
2011-09-14 23:31:58 +00:00
|
|
|
return None
|
|
|
|
|
2011-06-13 23:36:21 +00:00
|
|
|
def do_thrash(self):
|
2011-09-08 19:54:23 +00:00
|
|
|
cleanint = self.config.get("clean_interval", 60)
|
2012-01-17 17:24:54 +00:00
|
|
|
maxdead = self.config.get("max_dead", 0);
|
2011-09-08 19:54:23 +00:00
|
|
|
delay = self.config.get("op_delay", 5)
|
2011-06-13 23:36:21 +00:00
|
|
|
self.log("starting do_thrash")
|
|
|
|
while not self.stopping:
|
2011-09-14 23:31:58 +00:00
|
|
|
self.log(" ".join([str(x) for x in ["in_osds: ", self.in_osds, " out_osds: ", self.out_osds,
|
|
|
|
"dead_osds: ", self.dead_osds, "live_osds: ",
|
|
|
|
self.live_osds]]))
|
2011-09-08 19:54:23 +00:00
|
|
|
if random.uniform(0,1) < (float(delay) / cleanint):
|
2012-01-10 21:57:55 +00:00
|
|
|
while len(self.dead_osds) > maxdead:
|
|
|
|
self.revive_osd()
|
2013-07-26 02:43:08 +00:00
|
|
|
if random.uniform(0, 1) < float(
|
|
|
|
self.config.get('chance_test_map_discontinuity', 0)):
|
|
|
|
self.test_map_discontinuity()
|
|
|
|
else:
|
|
|
|
self.ceph_manager.wait_for_recovery(
|
|
|
|
timeout=self.config.get('timeout')
|
|
|
|
)
|
2013-07-22 23:24:41 +00:00
|
|
|
time.sleep(self.clean_wait)
|
2011-09-14 23:31:58 +00:00
|
|
|
self.choose_action()()
|
2011-08-25 22:18:42 +00:00
|
|
|
time.sleep(delay)
|
2011-09-08 21:07:23 +00:00
|
|
|
self.all_up()
|
2011-06-13 23:36:21 +00:00
|
|
|
|
|
|
|
class CephManager:
|
2013-01-23 02:13:19 +00:00
|
|
|
def __init__(self, controller, ctx=None, config=None, logger=None):
    """Wrap a remote `controller` used to run ceph cluster commands."""
    self.lock = threading.RLock()
    self.ctx = ctx
    self.config = config
    self.controller = controller
    self.next_pool_id = 0
    if (logger):
        self.log = lambda x: logger.info(x)
    else:
        def tmp(x):
            print(x)
        self.log = tmp
    # callers (e.g. mon_thrash) may hand us config=None; fall back to an
    # empty dict so later .get() calls do not explode
    if self.config is None:
        self.config = dict()
    # snapshot the current pools and their pg counts
    self.pools = {}
    for pool in self.list_pools():
        self.pools[pool] = self.get_pool_property(pool, 'pg_num')
|
2011-06-13 23:36:21 +00:00
|
|
|
|
2011-10-04 00:49:13 +00:00
|
|
|
def raw_cluster_cmd(self, *args):
    """Run `ceph *args` on the controller and return its stdout text."""
    testdir = teuthology.get_testdir(self.ctx)
    prefix = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'ceph',
        ]
    proc = self.controller.run(
        args=prefix + list(args),
        stdout=StringIO(),
        )
    return proc.stdout.getvalue()
|
2011-06-13 23:36:21 +00:00
|
|
|
|
2013-03-28 22:24:33 +00:00
|
|
|
def raw_cluster_cmd_result(self, *args):
    """Run `ceph *args` and return the exit status (never raises)."""
    testdir = teuthology.get_testdir(self.ctx)
    prefix = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'ceph',
        ]
    proc = self.controller.run(
        args=prefix + list(args),
        check_status=False,
        )
    return proc.exitstatus
|
|
|
|
|
2013-01-19 01:11:09 +00:00
|
|
|
def do_rados(self, remote, cmd):
    """Run a `rados` command (cmd is a list of args) on `remote`."""
    testdir = teuthology.get_testdir(self.ctx)
    args = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'rados',
        ] + list(cmd)
    proc = remote.run(
        args=args,
        wait=True,
        )
    return proc
|
|
|
|
|
2013-06-04 21:12:07 +00:00
|
|
|
def rados_write_objects(
|
|
|
|
self, pool, num_objects, size, timelimit, threads, cleanup=False):
|
|
|
|
args = [
|
|
|
|
'-p', pool,
|
|
|
|
'--num-objects', num_objects,
|
|
|
|
'-b', size,
|
|
|
|
'bench', timelimit,
|
|
|
|
'write'
|
|
|
|
]
|
|
|
|
if not cleanup: args.append('--no-cleanup')
|
|
|
|
return self.do_rados(self.controller, map(str, args))
|
|
|
|
|
2013-03-27 19:11:04 +00:00
|
|
|
def do_put(self, pool, obj, fname):
|
|
|
|
return self.do_rados(
|
|
|
|
self.controller,
|
|
|
|
[
|
|
|
|
'-p',
|
|
|
|
pool,
|
|
|
|
'put',
|
|
|
|
obj,
|
|
|
|
fname
|
|
|
|
]
|
|
|
|
)
|
|
|
|
|
|
|
|
def do_get(self, pool, obj, fname='/dev/null'):
|
|
|
|
return self.do_rados(
|
|
|
|
self.controller,
|
|
|
|
[
|
|
|
|
'-p',
|
|
|
|
pool,
|
|
|
|
'stat',
|
|
|
|
obj,
|
|
|
|
fname
|
|
|
|
]
|
|
|
|
)
|
|
|
|
|
2013-01-31 00:45:46 +00:00
|
|
|
def osd_admin_socket(self, osdnum, command, check_status=True):
    """Run an admin-socket `command` against osd.<osdnum> on its host."""
    testdir = teuthology.get_testdir(self.ctx)
    # locate the remote that hosts this osd id
    remote = None
    for _remote, roles_for_host in self.ctx.cluster.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            if int(id_) == int(osdnum):
                remote = _remote
    assert remote is not None
    args = [
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'ceph',
        '--admin-daemon',
        '/var/run/ceph/ceph-osd.{id}.asok'.format(id=osdnum),
        ] + list(command)
    return remote.run(
        args=args,
        stdout=StringIO(),
        wait=True,
        check_status=check_status
        )
|
|
|
|
|
2013-03-27 19:11:04 +00:00
|
|
|
def get_pgid(self, pool, pgnum):
|
|
|
|
poolnum = self.get_pool_num(pool)
|
|
|
|
pg_str = "{poolnum}.{pgnum}".format(
|
|
|
|
poolnum=poolnum,
|
|
|
|
pgnum=pgnum)
|
|
|
|
return pg_str
|
|
|
|
|
|
|
|
def get_pg_replica(self, pool, pgnum):
|
|
|
|
"""
|
|
|
|
get replica for pool, pgnum (e.g. (data, 0)->0
|
|
|
|
"""
|
|
|
|
output = self.raw_cluster_cmd("pg", "dump", '--format=json')
|
|
|
|
j = json.loads('\n'.join(output.split('\n')[1:]))
|
|
|
|
pg_str = self.get_pgid(pool, pgnum)
|
|
|
|
for pg in j['pg_stats']:
|
|
|
|
if pg['pgid'] == pg_str:
|
|
|
|
return int(pg['acting'][-1])
|
|
|
|
assert False
|
|
|
|
|
2012-11-09 00:22:40 +00:00
|
|
|
def get_pg_primary(self, pool, pgnum):
|
|
|
|
"""
|
|
|
|
get primary for pool, pgnum (e.g. (data, 0)->0
|
|
|
|
"""
|
|
|
|
output = self.raw_cluster_cmd("pg", "dump", '--format=json')
|
|
|
|
j = json.loads('\n'.join(output.split('\n')[1:]))
|
2013-03-27 19:11:04 +00:00
|
|
|
pg_str = self.get_pgid(pool, pgnum)
|
2012-11-09 00:22:40 +00:00
|
|
|
for pg in j['pg_stats']:
|
|
|
|
if pg['pgid'] == pg_str:
|
|
|
|
return int(pg['acting'][0])
|
|
|
|
assert False
|
|
|
|
|
|
|
|
def get_pool_num(self, pool):
|
|
|
|
"""
|
|
|
|
get number for pool (e.g., data -> 2)
|
|
|
|
"""
|
2013-06-08 04:58:41 +00:00
|
|
|
out = self.raw_cluster_cmd('osd','dump','--format=json')
|
2012-11-09 00:22:40 +00:00
|
|
|
j = json.loads('\n'.join(out.split('\n')[1:]))
|
|
|
|
for i in j['pools']:
|
|
|
|
if i['pool_name'] == pool:
|
|
|
|
return int(i['pool'])
|
|
|
|
assert False
|
|
|
|
|
2013-06-04 21:12:07 +00:00
|
|
|
def list_pools(self):
|
|
|
|
"""
|
|
|
|
list all pool names
|
|
|
|
"""
|
|
|
|
out = self.raw_cluster_cmd('osd','dump','--format=json')
|
|
|
|
j = json.loads('\n'.join(out.split('\n')[1:]))
|
|
|
|
self.log(j['pools'])
|
|
|
|
return [str(i['pool_name']) for i in j['pools']]
|
|
|
|
|
|
|
|
def clear_pools(self):
|
|
|
|
"""
|
|
|
|
remove all pools
|
|
|
|
"""
|
|
|
|
[self.remove_pool(i) for i in self.list_pools()]
|
|
|
|
|
2013-01-29 03:22:42 +00:00
|
|
|
def kick_recovery_wq(self, osdnum):
|
|
|
|
return self.raw_cluster_cmd(
|
|
|
|
'tell', "osd.%d" % (int(osdnum),),
|
|
|
|
'debug',
|
|
|
|
'kick_recovery_wq',
|
|
|
|
'0')
|
|
|
|
|
2013-05-06 21:10:11 +00:00
|
|
|
def wait_run_admin_socket(self, osdnum, args=['version'], timeout=75):
|
2013-02-20 06:24:27 +00:00
|
|
|
tries = 0
|
2013-01-31 00:45:46 +00:00
|
|
|
while True:
|
|
|
|
proc = self.osd_admin_socket(
|
|
|
|
osdnum, args,
|
|
|
|
check_status=False)
|
|
|
|
if proc.exitstatus is 0:
|
|
|
|
break
|
|
|
|
else:
|
2013-02-20 06:24:27 +00:00
|
|
|
tries += 1
|
2013-05-06 21:10:11 +00:00
|
|
|
if (tries * 5) > timeout:
|
2013-02-27 00:47:33 +00:00
|
|
|
raise Exception('timed out waiting for admin_socket to appear after osd.{o} restart'.format(o=osdnum))
|
2013-01-31 00:45:46 +00:00
|
|
|
self.log(
|
|
|
|
"waiting on admin_socket for {osdnum}, {command}".format(
|
|
|
|
osdnum=osdnum,
|
|
|
|
command=args))
|
|
|
|
time.sleep(5)
|
|
|
|
|
2012-11-09 00:22:40 +00:00
|
|
|
def set_config(self, osdnum, **argdict):
|
2013-01-31 00:45:46 +00:00
|
|
|
for k,v in argdict.iteritems():
|
|
|
|
self.wait_run_admin_socket(
|
|
|
|
osdnum,
|
|
|
|
['config', 'set', str(k), str(v)])
|
2012-11-09 00:22:40 +00:00
|
|
|
|
2011-06-13 23:36:21 +00:00
|
|
|
def raw_cluster_status(self):
|
2013-08-16 16:24:55 +00:00
|
|
|
status = self.raw_cluster_cmd('status', '--format=json-pretty')
|
|
|
|
return json.loads(status)
|
2011-07-12 01:00:03 +00:00
|
|
|
|
2011-06-13 23:36:21 +00:00
|
|
|
def raw_osd_status(self):
|
2011-10-04 00:49:13 +00:00
|
|
|
return self.raw_cluster_cmd('osd', 'dump')
|
2011-06-13 23:36:21 +00:00
|
|
|
|
|
|
|
def get_osd_status(self):
|
|
|
|
osd_lines = filter(
|
2011-09-29 16:09:31 +00:00
|
|
|
lambda x: x.startswith('osd.') and (("up" in x) or ("down" in x)),
|
2011-06-13 23:36:21 +00:00
|
|
|
self.raw_osd_status().split('\n'))
|
|
|
|
self.log(osd_lines)
|
2011-09-29 16:09:31 +00:00
|
|
|
in_osds = [int(i[4:].split()[0]) for i in filter(
|
2011-06-13 23:36:21 +00:00
|
|
|
lambda x: " in " in x,
|
|
|
|
osd_lines)]
|
2011-09-29 16:09:31 +00:00
|
|
|
out_osds = [int(i[4:].split()[0]) for i in filter(
|
2011-06-13 23:36:21 +00:00
|
|
|
lambda x: " out " in x,
|
|
|
|
osd_lines)]
|
2011-09-29 16:09:31 +00:00
|
|
|
up_osds = [int(i[4:].split()[0]) for i in filter(
|
2011-06-13 23:36:21 +00:00
|
|
|
lambda x: " up " in x,
|
|
|
|
osd_lines)]
|
2011-09-29 16:09:31 +00:00
|
|
|
down_osds = [int(i[4:].split()[0]) for i in filter(
|
2011-06-13 23:36:21 +00:00
|
|
|
lambda x: " down " in x,
|
|
|
|
osd_lines)]
|
2011-09-14 23:31:58 +00:00
|
|
|
dead_osds = [int(x.id_) for x in
|
|
|
|
filter(lambda x: not x.running(), self.ctx.daemons.iter_daemons_of_role('osd'))]
|
|
|
|
live_osds = [int(x.id_) for x in
|
|
|
|
filter(lambda x: x.running(), self.ctx.daemons.iter_daemons_of_role('osd'))]
|
2011-07-12 01:00:03 +00:00
|
|
|
return { 'in' : in_osds, 'out' : out_osds, 'up' : up_osds,
|
2011-09-14 23:31:58 +00:00
|
|
|
'down' : down_osds, 'dead' : dead_osds, 'live' : live_osds, 'raw' : osd_lines }
|
2011-06-13 23:36:21 +00:00
|
|
|
|
|
|
|
def get_num_pgs(self):
|
|
|
|
status = self.raw_cluster_status()
|
2011-11-03 20:27:44 +00:00
|
|
|
self.log(status)
|
2013-08-16 16:24:55 +00:00
|
|
|
return status['pgmap']['num_pgs']
|
2011-06-13 23:36:21 +00:00
|
|
|
|
2013-11-14 22:01:51 +00:00
|
|
|
def create_pool_with_unique_name(self, pg_num=1):
|
|
|
|
name = ""
|
|
|
|
with self.lock:
|
|
|
|
name = "unique_pool_%s"%(str(self.next_pool_id),)
|
|
|
|
self.next_pool_id += 1
|
|
|
|
self.create_pool(name, pg_num)
|
|
|
|
return name
|
|
|
|
|
2012-12-11 22:21:48 +00:00
|
|
|
def create_pool(self, pool_name, pg_num=1):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
assert isinstance(pg_num, int)
|
|
|
|
assert pool_name not in self.pools
|
|
|
|
self.log("creating pool_name %s"%(pool_name,))
|
|
|
|
self.raw_cluster_cmd('osd', 'pool', 'create', pool_name, str(pg_num))
|
|
|
|
self.pools[pool_name] = pg_num
|
|
|
|
|
|
|
|
def remove_pool(self, pool_name):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
assert pool_name in self.pools
|
|
|
|
self.log("creating pool_name %s"%(pool_name,))
|
|
|
|
del self.pools[pool_name]
|
2013-01-24 18:07:10 +00:00
|
|
|
self.do_rados(
|
|
|
|
self.controller,
|
|
|
|
['rmpool', pool_name, pool_name, "--yes-i-really-really-mean-it"]
|
|
|
|
)
|
2012-12-11 22:21:48 +00:00
|
|
|
|
|
|
|
def get_pool(self):
|
|
|
|
with self.lock:
|
|
|
|
return random.choice(self.pools.keys());
|
|
|
|
|
|
|
|
def get_pool_pg_num(self, pool_name):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
if pool_name in self.pools:
|
|
|
|
return self.pools[pool_name]
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
def get_pool_property(self, pool_name, prop):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
assert isinstance(prop, str)
|
|
|
|
output = self.raw_cluster_cmd(
|
|
|
|
'osd',
|
|
|
|
'pool',
|
|
|
|
'get',
|
|
|
|
pool_name,
|
|
|
|
prop)
|
|
|
|
return int(output.split()[1])
|
|
|
|
|
|
|
|
def set_pool_property(self, pool_name, prop, val):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
assert isinstance(prop, str)
|
|
|
|
assert isinstance(val, int)
|
2013-03-28 22:24:33 +00:00
|
|
|
tries = 0
|
|
|
|
while True:
|
|
|
|
r = self.raw_cluster_cmd_result(
|
|
|
|
'osd',
|
|
|
|
'pool',
|
|
|
|
'set',
|
|
|
|
pool_name,
|
|
|
|
prop,
|
2013-07-29 19:12:24 +00:00
|
|
|
str(val))
|
2013-03-28 22:24:33 +00:00
|
|
|
if r != 11: # EAGAIN
|
|
|
|
break
|
|
|
|
tries += 1
|
|
|
|
if tries > 50:
|
|
|
|
raise Exception('timed out getting EAGAIN when setting pool property %s %s = %s' % (pool_name, prop, val))
|
|
|
|
self.log('got EAGAIN setting pool property, waiting a few seconds...')
|
|
|
|
time.sleep(2)
|
2012-12-11 22:21:48 +00:00
|
|
|
|
|
|
|
def expand_pool(self, pool_name, by, max_pgs):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
assert isinstance(by, int)
|
|
|
|
assert pool_name in self.pools
|
|
|
|
if self.get_num_creating() > 0:
|
|
|
|
return
|
|
|
|
if (self.pools[pool_name] + by) > max_pgs:
|
|
|
|
return
|
|
|
|
self.log("increase pool size by %d"%(by,))
|
|
|
|
new_pg_num = self.pools[pool_name] + by
|
|
|
|
self.set_pool_property(pool_name, "pg_num", new_pg_num)
|
|
|
|
self.pools[pool_name] = new_pg_num
|
|
|
|
|
|
|
|
def set_pool_pgpnum(self, pool_name):
|
|
|
|
with self.lock:
|
|
|
|
assert isinstance(pool_name, str)
|
|
|
|
assert pool_name in self.pools
|
|
|
|
if self.get_num_creating() > 0:
|
|
|
|
return
|
|
|
|
self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name])
|
|
|
|
|
2012-02-24 17:22:03 +00:00
|
|
|
def list_pg_missing(self, pgid):
|
|
|
|
r = None
|
|
|
|
offset = {}
|
|
|
|
while True:
|
|
|
|
out = self.raw_cluster_cmd('--', 'pg',pgid,'list_missing',
|
|
|
|
json.dumps(offset))
|
2013-06-13 21:51:21 +00:00
|
|
|
j = json.loads(out)
|
2012-02-24 17:22:03 +00:00
|
|
|
if r is None:
|
|
|
|
r = j
|
|
|
|
else:
|
|
|
|
r['objects'].extend(j['objects'])
|
|
|
|
if not 'more' in j:
|
|
|
|
break
|
|
|
|
if j['more'] == 0:
|
|
|
|
break
|
|
|
|
offset = j['objects'][-1]['oid']
|
|
|
|
if 'more' in r:
|
|
|
|
del r['more']
|
|
|
|
return r
|
|
|
|
|
2011-10-17 22:32:22 +00:00
|
|
|
def get_pg_stats(self):
|
2013-06-08 04:58:41 +00:00
|
|
|
out = self.raw_cluster_cmd('pg','dump','--format=json')
|
2011-10-17 22:32:22 +00:00
|
|
|
j = json.loads('\n'.join(out.split('\n')[1:]))
|
|
|
|
return j['pg_stats']
|
|
|
|
|
2013-03-21 21:37:38 +00:00
|
|
|
def compile_pg_status(self):
|
|
|
|
ret = {}
|
|
|
|
j = self.get_pg_stats()
|
|
|
|
for pg in j:
|
|
|
|
for status in pg['state'].split('+'):
|
|
|
|
if status not in ret:
|
|
|
|
ret[status] = 0
|
|
|
|
ret[status] += 1
|
|
|
|
return ret
|
|
|
|
|
2013-03-27 19:11:04 +00:00
|
|
|
def pg_scrubbing(self, pool, pgnum):
|
|
|
|
pgstr = self.get_pgid(pool, pgnum)
|
|
|
|
stats = self.get_single_pg_stats(pgstr)
|
|
|
|
return 'scrub' in stats['state']
|
|
|
|
|
|
|
|
def pg_repairing(self, pool, pgnum):
|
|
|
|
pgstr = self.get_pgid(pool, pgnum)
|
|
|
|
stats = self.get_single_pg_stats(pgstr)
|
|
|
|
return 'repair' in stats['state']
|
|
|
|
|
|
|
|
def pg_inconsistent(self, pool, pgnum):
|
|
|
|
pgstr = self.get_pgid(pool, pgnum)
|
|
|
|
stats = self.get_single_pg_stats(pgstr)
|
|
|
|
return 'inconsistent' in stats['state']
|
|
|
|
|
|
|
|
def get_last_scrub_stamp(self, pool, pgnum):
    """Return the last_scrub_stamp recorded for pg <pgnum> of <pool>."""
    pgid = self.get_pgid(pool, pgnum)
    return self.get_single_pg_stats(pgid)['last_scrub_stamp']
def do_pg_scrub(self, pool, pgnum, stype):
    """Issue a scrub of type stype (e.g. 'scrub', 'deep-scrub', 'repair')
    on one pg and poll until its last_scrub_stamp changes, i.e. the
    scrub has completed.
    """
    before = self.get_last_scrub_stamp(pool, pgnum)
    self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
    while self.get_last_scrub_stamp(pool, pgnum) == before:
        self.log("waiting for scrub type %s" % (stype,))
        time.sleep(10)
2012-08-02 17:58:08 +00:00
|
|
|
def get_single_pg_stats(self, pgid):
    """Return the stats dict for the pg with id pgid, or None if absent."""
    matches = [pg for pg in self.get_pg_stats() if pg['pgid'] == pgid]
    return matches[0] if matches else None
2011-10-17 22:32:22 +00:00
|
|
|
def get_osd_dump(self):
    """Return the 'osds' list from 'ceph osd dump'."""
    raw = self.raw_cluster_cmd('osd', 'dump', '--format=json')
    # skip the non-JSON status line prepended to the output
    payload = '\n'.join(raw.split('\n')[1:])
    return json.loads(payload)['osds']
2012-02-21 21:11:05 +00:00
|
|
|
def get_stuck_pgs(self, type_, threshold):
    """Return PGs stuck in a state for longer than threshold seconds.

    :param type_: one of the dump_stuck categories
                  (e.g. 'inactive', 'unclean', 'stale')
    :param threshold: seconds a pg must have been stuck
    """
    out = self.raw_cluster_cmd('pg', 'dump_stuck', type_,
                               str(threshold), '--format=json')
    return json.loads(out)
2011-10-17 22:32:22 +00:00
|
|
|
def get_num_unfound_objects(self):
    """Return the cluster-wide unfound-object count (0 if not reported)."""
    status = self.raw_cluster_status()
    self.log(status)
    pgmap = status['pgmap']
    return pgmap.get('unfound_objects', 0)
2012-12-11 22:21:48 +00:00
|
|
|
def get_num_creating(self):
    """Return how many PGs are currently in a 'creating' state."""
    return sum(1 for pg in self.get_pg_stats()
               if 'creating' in pg['state'])
2011-06-13 23:36:21 +00:00
|
|
|
def get_num_active_clean(self):
    """Count PGs that are active and clean, and not stale."""
    num = 0
    for pg in self.get_pg_stats():
        state = pg['state']
        if 'active' in state and 'clean' in state and 'stale' not in state:
            num += 1
    return num
2012-02-18 05:53:25 +00:00
|
|
|
def get_num_active_recovered(self):
    """Count PGs that are active with no recovery or backfill in
    progress, and not stale."""
    return sum(1 for pg in self.get_pg_stats()
               if 'active' in pg['state']
               and 'recover' not in pg['state']
               and 'backfill' not in pg['state']
               and 'stale' not in pg['state'])
2011-10-17 22:32:22 +00:00
|
|
|
def get_num_active(self):
    """Count PGs that are active and not stale."""
    num = 0
    for pg in self.get_pg_stats():
        state = pg['state']
        if 'active' in state and 'stale' not in state:
            num += 1
    return num
2012-07-28 17:22:02 +00:00
|
|
|
def get_num_down(self):
    """Count PGs that are down or incomplete (and not stale)."""
    # 'stale' excludes a pg from either branch, so the shared term is
    # factored out of the original (down|incomplete) disjunction
    return sum(1 for pg in self.get_pg_stats()
               if ('down' in pg['state'] or 'incomplete' in pg['state'])
               and 'stale' not in pg['state'])
2012-02-26 05:05:00 +00:00
|
|
|
def get_num_active_down(self):
    """Count PGs that are active, down, or incomplete (and not stale)."""
    # 'stale' excludes a pg from every branch, so the shared term is
    # factored out of the original three-way disjunction
    return sum(1 for pg in self.get_pg_stats()
               if ('active' in pg['state']
                   or 'down' in pg['state']
                   or 'incomplete' in pg['state'])
               and 'stale' not in pg['state'])
2011-06-13 23:36:21 +00:00
|
|
|
def is_clean(self):
    """True when every PG in the cluster is active+clean."""
    total = self.get_num_pgs()
    return self.get_num_active_clean() == total
2012-02-18 05:53:25 +00:00
|
|
|
def is_recovered(self):
    """True when every PG is active with no recovery/backfill pending."""
    total = self.get_num_pgs()
    return self.get_num_active_recovered() == total
2012-02-26 05:05:00 +00:00
|
|
|
def is_active_or_down(self):
    """True when every PG is either active or blocked (down/incomplete)."""
    total = self.get_num_pgs()
    return self.get_num_active_down() == total
2012-02-18 05:53:25 +00:00
|
|
|
def wait_for_clean(self, timeout=None):
    """Block until every PG is active+clean.

    The timeout is measured since the last observed progress: whenever
    the active+clean count changes, the clock restarts.

    :param timeout: seconds without progress before asserting out
                    (None waits forever)
    """
    self.log("waiting for clean")
    start = time.time()
    prev = self.get_num_active_clean()
    while not self.is_clean():
        if timeout is not None:
            assert time.time() - start < timeout, \
                'failed to become clean before timeout expired'
        cur = self.get_num_active_clean()
        if cur != prev:
            # progress was made; restart the timeout clock
            start = time.time()
            prev = cur
        time.sleep(3)
    self.log("clean!")
2013-07-29 19:58:28 +00:00
|
|
|
def are_all_osds_up(self):
    """True when every osd in the osdmap reports up.

    Equivalent to the len()==sum(up) comparison: true iff every entry
    has up > 0 (vacuously true for an empty osdmap).
    """
    return all(osd['up'] > 0 for osd in self.get_osd_dump())
def wait_for_all_up(self, timeout=None):
    """Poll until every osd is up; assert out after timeout seconds.

    :param timeout: absolute seconds to wait (None waits forever)
    """
    self.log("waiting for all up")
    start = time.time()
    while not self.are_all_osds_up():
        if timeout is not None:
            assert time.time() - start < timeout, \
                'timeout expired in wait_for_all_up'
        time.sleep(3)
    self.log("all up!")
2012-02-18 05:53:25 +00:00
|
|
|
def wait_for_recovery(self, timeout=None):
    """Block until recovery/backfill has finished on every PG.

    The timeout restarts whenever the recovered count changes
    (i.e. whenever progress is observed).

    :param timeout: seconds without progress before asserting out
    """
    self.log("waiting for recovery to complete")
    start = time.time()
    prev = self.get_num_active_recovered()
    while not self.is_recovered():
        if timeout is not None:
            assert time.time() - start < timeout, \
                'failed to recover before timeout expired'
        cur = self.get_num_active_recovered()
        if cur != prev:
            # progress was made; restart the timeout clock
            start = time.time()
            prev = cur
        time.sleep(3)
    self.log("recovered!")
2012-07-28 17:22:13 +00:00
|
|
|
def wait_for_active(self, timeout=None):
    """Block until every PG is active (peering complete).

    The timeout restarts whenever the active count changes
    (i.e. whenever progress is observed).

    :param timeout: seconds without progress before asserting out
    """
    self.log("waiting for peering to complete")
    start = time.time()
    num_active = self.get_num_active()
    while not self.is_active():
        if timeout is not None:
            # message fixed: this loop waits for 'active', not recovery
            # (the old text was copy-pasted from wait_for_recovery)
            assert time.time() - start < timeout, \
                'failed to become active before timeout expired'
        cur_active = self.get_num_active()
        if cur_active != num_active:
            # progress was made; restart the timeout clock
            start = time.time()
            num_active = cur_active
        time.sleep(3)
    self.log("active!")
2012-02-26 05:05:00 +00:00
|
|
|
def wait_for_active_or_down(self, timeout=None):
    """Block until every PG is either active or blocked (down/incomplete).

    The timeout restarts whenever the active-or-down count changes
    (i.e. whenever progress is observed).

    :param timeout: seconds without progress before asserting out
    """
    self.log("waiting for peering to complete or become blocked")
    start = time.time()
    num_active_down = self.get_num_active_down()
    while not self.is_active_or_down():
        if timeout is not None:
            # message fixed: this loop waits for active-or-down, not
            # recovery (the old text was copy-pasted from wait_for_recovery)
            assert time.time() - start < timeout, \
                'failed to become active or down before timeout expired'
        cur_active_down = self.get_num_active_down()
        if cur_active_down != num_active_down:
            # progress was made; restart the timeout clock
            start = time.time()
            num_active_down = cur_active_down
        time.sleep(3)
    self.log("active or down!")
2011-10-17 22:32:22 +00:00
|
|
|
def osd_is_up(self, osd):
    """Return True if osd <osd> is marked up in the osdmap."""
    return self.get_osd_dump()[osd]['up'] > 0
def wait_till_osd_is_up(self, osd, timeout=None):
    """Poll until osd <osd> is up; assert out after timeout seconds."""
    self.log('waiting for osd.%d to be up' % osd)
    start = time.time()
    while not self.osd_is_up(osd):
        if timeout is not None:
            assert time.time() - start < timeout, \
                'osd.%d failed to come up before timeout expired' % osd
        time.sleep(3)
    self.log('osd.%d is up' % osd)
def is_active(self):
    """True when every PG in the cluster is active."""
    total = self.get_num_pgs()
    return self.get_num_active() == total
def wait_till_active(self, timeout=None):
    """Poll until every PG is active; assert out after timeout seconds.

    :param timeout: absolute seconds to wait (None waits forever)
    """
    self.log("waiting till active")
    start = time.time()
    while not self.is_active():
        if timeout is not None:
            assert time.time() - start < timeout, \
                'failed to become active before timeout expired'
        time.sleep(3)
    self.log("active!")
2011-06-13 23:36:21 +00:00
|
|
|
def mark_out_osd(self, osd):
    """Mark osd <osd> out of the data distribution."""
    osd_id = str(osd)
    self.raw_cluster_cmd('osd', 'out', osd_id)
2011-09-14 23:31:58 +00:00
|
|
|
def kill_osd(self, osd):
    """Take down osd <osd>.

    With config 'powercycle' set, the host carrying the osd is
    powered off via its IPMI console; otherwise the daemon process is
    simply stopped.

    :param osd: osd id (int)
    """
    if self.config.get('powercycle'):
        # locate the one remote that runs this osd
        (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
        self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
        assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_off()
    else:
        self.ctx.daemons.get_daemon('osd', osd).stop()
2012-02-01 00:13:59 +00:00
|
|
|
def blackhole_kill_osd(self, osd):
    """Stop osd <osd> after telling its filestore to drop all
    transactions (filestore-blackhole), simulating a crash that loses
    recent writes.
    """
    target = 'osd.%d' % osd
    self.raw_cluster_cmd('--', 'tell', target,
                         'injectargs', '--filestore-blackhole')
    # give the osd a moment to apply the injected setting
    time.sleep(2)
    self.ctx.daemons.get_daemon('osd', osd).stop()
2013-05-06 21:10:11 +00:00
|
|
|
def revive_osd(self, osd, timeout=75):
    """Bring osd <osd> back up.

    With config 'powercycle' set, the host is powered back on via
    IPMI, we reconnect over ssh, remount the osd data and recreate the
    admin socket dir before restarting the daemon; otherwise the
    daemon process is simply restarted.

    :param osd: osd id (int)
    :param timeout: seconds to wait for the admin socket to answer
                    after restart
    """
    if self.config.get('powercycle'):
        # locate the one remote that runs this osd
        (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
        # NOTE(review): message says 'kill_osd' but this is a revive --
        # looks copy-pasted; confirm before relying on this log line
        self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
        assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_on()
        if not remote.console.check_status(300):
            raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
        # host is back; re-establish ssh, remount data, recreate asok dir
        teuthology.reconnect(self.ctx, 60, [remote])
        ceph_task.mount_osd_data(self.ctx, remote, str(osd))
        ceph_task.make_admin_daemon_dir(self.ctx, remote)
        self.ctx.daemons.get_daemon('osd', osd).reset()
    self.ctx.daemons.get_daemon('osd', osd).restart()
    # wait for dump_ops_in_flight; this command doesn't appear
    # until after the signal handler is installed and it is safe
    # to stop the osd again without making valgrind leak checks
    # unhappy. see #5924.
    self.wait_run_admin_socket(osd,
                               args=['dump_ops_in_flight'],
                               timeout=timeout)
2011-08-25 22:19:30 +00:00
|
|
|
def mark_down_osd(self, osd):
    """Mark osd <osd> down in the osdmap."""
    osd_id = str(osd)
    self.raw_cluster_cmd('osd', 'down', osd_id)
2011-06-13 23:36:21 +00:00
|
|
|
def mark_in_osd(self, osd):
    """Mark osd <osd> back in to the data distribution."""
    osd_id = str(osd)
    self.raw_cluster_cmd('osd', 'in', osd_id)
## monitors
|
|
|
|
|
2013-07-06 01:04:40 +00:00
|
|
|
def signal_mon(self, mon, sig):
    """Deliver signal <sig> to the mon.<mon> daemon process."""
    daemon = self.ctx.daemons.get_daemon('mon', mon)
    daemon.signal(sig)
2011-11-09 06:02:58 +00:00
|
|
|
def kill_mon(self, mon):
    """Take down mon.<mon>.

    With config 'powercycle' set, the host carrying the mon is
    powered off via its IPMI console; otherwise the daemon process is
    simply stopped.

    :param mon: mon id (str)
    """
    if self.config.get('powercycle'):
        # locate the one remote that runs this mon
        (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys()
        self.log('kill_mon on mon.{m} doing powercycle of {s}'.format(m=mon, s=remote.name))
        assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_off()
    else:
        self.ctx.daemons.get_daemon('mon', mon).stop()
def revive_mon(self, mon):
    """Bring mon.<mon> back up.

    With config 'powercycle' set, the host is powered back on via
    IPMI and the admin socket dir recreated before restart; otherwise
    the daemon process is simply restarted.

    NOTE(review): unlike revive_osd, there is no console.check_status /
    reconnect wait here after power_on -- confirm the restart below can
    tolerate the host still booting.

    :param mon: mon id (str)
    """
    if self.config.get('powercycle'):
        # locate the one remote that runs this mon
        (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys()
        self.log('revive_mon on mon.{m} doing powercycle of {s}'.format(m=mon, s=remote.name))
        assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_on()
        ceph_task.make_admin_daemon_dir(self.ctx, remote)
    self.ctx.daemons.get_daemon('mon', mon).restart()
def get_mon_status(self, mon):
    """Return mon_status (parsed JSON) queried directly from mon.<mon>
    at the address configured in ceph.conf."""
    addr = self.ctx.ceph.conf['mon.%s' % mon]['mon addr']
    raw = self.raw_cluster_cmd('-m', addr, 'mon_status')
    return json.loads(raw)
def get_mon_quorum(self):
    """Return the list of mon ranks currently in quorum."""
    out = self.raw_cluster_cmd('quorum_status')
    # parse first so malformed output raises before anything is logged
    # (same order as the original)
    parsed = json.loads(out)
    self.log('quorum_status is %s' % out)
    return parsed['quorum']
2011-11-17 19:05:12 +00:00
|
|
|
def wait_for_mon_quorum_size(self, size, timeout=300):
    """Poll until the mon quorum has exactly <size> members.

    :param size: expected quorum member count
    :param timeout: absolute seconds to wait (None waits forever)
    """
    self.log('waiting for quorum size %d' % size)
    start = time.time()
    while len(self.get_mon_quorum()) != size:
        if timeout is not None:
            assert time.time() - start < timeout, \
                'failed to reach quorum size %d before timeout expired' % size
        time.sleep(3)
    self.log("quorum is size %d" % size)
def get_mon_health(self, debug=False):
    """Return parsed 'ceph health --format=json'.

    :param debug: when True, also log the raw health output
    """
    raw = self.raw_cluster_cmd('health', '--format=json')
    if debug:
        self.log('health:\n{h}'.format(h=raw))
    return json.loads(raw)
|
|
## metadata servers
|
|
|
|
|
|
|
|
def kill_mds(self, mds):
    """Take down mds.<mds>.

    With config 'powercycle' set, the host carrying the mds is
    powered off via its IPMI console; otherwise the daemon process is
    simply stopped.

    :param mds: mds id (str)
    """
    if self.config.get('powercycle'):
        # locate the one remote that runs this mds
        (remote,) = self.ctx.cluster.only('mds.{m}'.format(m=mds)).remotes.iterkeys()
        self.log('kill_mds on mds.{m} doing powercycle of {s}'.format(m=mds, s=remote.name))
        assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_off()
    else:
        self.ctx.daemons.get_daemon('mds', mds).stop()
def kill_mds_by_rank(self, rank):
    """Kill whichever mds currently holds <rank>."""
    name = self.get_mds_status_by_rank(rank)['name']
    self.kill_mds(name)
def revive_mds(self, mds, standby_for_rank=None):
    """Bring mds.<mds> back up, optionally as a hot standby.

    With config 'powercycle' set, the host is powered back on via
    IPMI and the admin socket dir recreated before restart; otherwise
    the daemon process is simply restarted.

    :param mds: mds id (str)
    :param standby_for_rank: when truthy, passed as --hot-standby to
                             the restarted daemon
    """
    if self.config.get('powercycle'):
        # locate the one remote that runs this mds
        (remote,) = self.ctx.cluster.only('mds.{m}'.format(m=mds)).remotes.iterkeys()
        self.log('revive_mds on mds.{m} doing powercycle of {s}'.format(m=mds, s=remote.name))
        assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_on()
        ceph_task.make_admin_daemon_dir(self.ctx, remote)
    args = []
    if standby_for_rank:
        args.extend(['--hot-standby', standby_for_rank])
    self.ctx.daemons.get_daemon('mds', mds).restart(*args)
def revive_mds_by_rank(self, rank, standby_for_rank=None):
    """Revive whichever mds holds <rank>, optionally as a hot standby."""
    name = self.get_mds_status_by_rank(rank)['name']
    self.revive_mds(name, standby_for_rank)
def get_mds_status(self, mds):
    """Return the mds map 'info' entry for the mds named <mds>, or None.

    Parses 'ceph mds dump --format=json', skipping the leading
    non-JSON status line.

    :param mds: mds name (str)
    """
    out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
    j = json.loads(' '.join(out.splitlines()[1:]))
    # values() instead of itervalues(): identical iteration behavior
    # under Python 2 and keeps the code Python 3 compatible
    for info in j['info'].values():
        if info['name'] == mds:
            return info
    return None
def get_mds_status_by_rank(self, rank):
    """Return the mds map 'info' entry for the mds holding <rank>, or None.

    Parses 'ceph mds dump --format=json', skipping the leading
    non-JSON status line.

    :param rank: mds rank (int)
    """
    out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
    j = json.loads(' '.join(out.splitlines()[1:]))
    # values() instead of itervalues(): identical iteration behavior
    # under Python 2 and keeps the code Python 3 compatible
    for info in j['info'].values():
        if info['rank'] == rank:
            return info
    return None
def get_mds_status_all(self):
    """Return the full parsed 'ceph mds dump' map (leading non-JSON
    status line skipped)."""
    raw = self.raw_cluster_cmd('mds', 'dump', '--format=json')
    return json.loads(' '.join(raw.splitlines()[1:]))