From e3bb551067d5dd881e4fd81aeb7ea35328941968 Mon Sep 17 00:00:00 2001
From: shylesh kumar
Date: Wed, 13 Apr 2016 15:06:45 -0400
Subject: [PATCH] Rados: wip-13531: Resolve stuck peering

Signed-off-by: shylesh kumar
---
 .../singleton/all/resolve_stuck_peering.yaml |  10 ++
 tasks/resolve_stuck_peering.py               | 108 ++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 suites/rados/singleton/all/resolve_stuck_peering.yaml
 create mode 100644 tasks/resolve_stuck_peering.py

diff --git a/suites/rados/singleton/all/resolve_stuck_peering.yaml b/suites/rados/singleton/all/resolve_stuck_peering.yaml
new file mode 100644
index 00000000000..278ff1aa5ba
--- /dev/null
+++ b/suites/rados/singleton/all/resolve_stuck_peering.yaml
@@ -0,0 +1,10 @@
+roles:
+- [mon.0]
+- [osd.0, osd.1, osd.2, client.0]
+
+tasks:
+- install:
+- ceph:
+    fs: xfs
+- resolve_stuck_peering:
+
diff --git a/tasks/resolve_stuck_peering.py b/tasks/resolve_stuck_peering.py
new file mode 100644
index 00000000000..f2ef5b594a4
--- /dev/null
+++ b/tasks/resolve_stuck_peering.py
@@ -0,0 +1,108 @@
+"""
+Resolve stuck peering
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test resolving stuck peering
+
+    requires 3 osds on a single test node
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'Resolve stuck peering only accepts a dict for config'
+
+    while len(ctx.manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+
+    ctx.manager.wait_for_clean()
+
+    dummyfile = '/etc/fstab'
+    dummyfile1 = '/etc/resolv.conf'
+
+    #create a 1 PG pool
+    pool='foo'
+    log.info('creating pool foo')
+    ctx.manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1')
+
+    #set min_size of the pool to 1
+    #so that we can continue with I/O
+    #when 2 osds are down
+    ctx.manager.set_pool_property(pool, "min_size", 1)
+
+    osds = [0, 1, 2]
+
+    primary = ctx.manager.get_pg_primary('foo', 0)
+    log.info("primary osd is %d", primary)
+
+    others = list(osds)
+    others.remove(primary)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    #create a few objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    ctx.manager.wait_for_clean()
+
+    #kill all osds except the primary
+    log.info('killing all osds except the primary')
+    for i in others:
+        ctx.manager.kill_osd(i)
+    for i in others:
+        ctx.manager.mark_down_osd(i)
+
+
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1])
+
+    #kill the primary osd
+    ctx.manager.kill_osd(primary)
+    ctx.manager.mark_down_osd(primary)
+
+    #revive the other 2 osds
+    for i in others:
+        ctx.manager.revive_osd(i)
+
+    #make sure that the pg is down
+    #assuming the pg number for a single-pg pool starts from 0
+    pgnum=0
+    pgstr = ctx.manager.get_pgid(pool, pgnum)
+    stats = ctx.manager.get_single_pg_stats(pgstr)
+    print stats['state']
+
+    timeout=60
+    start=time.time()
+
+    while 'down' not in ctx.manager.get_single_pg_stats(pgstr)['state']:
+        assert time.time() - start < timeout, \
+            'failed to reach down state before timeout expired'
+
+    #mark the primary as lost
+    ctx.manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,
+                                '--yes-i-really-mean-it')
+
+
+    #expect the pg status to be active+undersized+degraded
+    #the pg should recover and become active+clean within the timeout
+    stats = ctx.manager.get_single_pg_stats(pgstr)
+    print stats['state']
+
+    timeout=10
+    start=time.time()
+
+    while ctx.manager.get_num_down():
+        assert time.time() - start < timeout, \
+            'failed to recover before timeout expired'
+
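
For reference, the failure scenario this task automates can also be walked through by hand: bring up a small test cluster with three OSDs, create a one-PG pool with min_size 1, write some objects, stop the two replica OSDs, write more objects that only the primary holds, then stop the primary and restart the replicas, and the PG ends up stuck down until the dead primary is declared lost. The sketch below is a rough, hypothetical outline of that manual flow, not part of the patch itself; it assumes a throwaway cluster with the ceph and rados CLIs on PATH, and the pool name, object name, and OSD id 0 are placeholders only.

    import subprocess

    def sh(*cmd):
        # run a ceph/rados CLI command and return its output
        return subprocess.check_output(cmd)

    # one-PG pool that keeps accepting writes with a single replica up
    sh('ceph', 'osd', 'pool', 'create', 'foo', '1')
    sh('ceph', 'osd', 'pool', 'set', 'foo', 'min_size', '1')

    # seed an object; afterwards (out of band) stop the two replica OSDs,
    # write more objects that only the primary will hold, then stop the
    # primary and restart the replicas -- the PG is now stuck down
    sh('rados', '-p', 'foo', 'put', 'existing_0', '/etc/fstab')

    # declaring the dead primary lost is what lets peering resolve;
    # osd id 0 stands in for whichever OSD was the acting primary
    sh('ceph', 'osd', 'lost', '0', '--yes-i-really-mean-it')
    print sh('ceph', 'pg', 'stat')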