From a5b19d2d73540b730392f8001c8601f2cecc1b51 Mon Sep 17 00:00:00 2001 From: Nathan Cutler Date: Sun, 9 Apr 2017 20:11:27 +0200 Subject: [PATCH] tests: Thrasher: handle "OSD has the store locked" gracefully On slower machines (VPS, OVH) it takes time for the OSD to go down. Fixes: http://tracker.ceph.com/issues/19556 Signed-off-by: Nathan Cutler --- qa/tasks/ceph_manager.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index 8ff2556a7a0..00e075ace4f 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -257,12 +257,22 @@ class Thrasher: break log.debug("ceph-objectstore-tool binary not present, trying again") - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - if proc.exitstatus: - raise Exception("ceph-objectstore-tool: " - "exp list-pgs failure with status {ret}". - format(ret=proc.exitstatus)) + # ceph-objectstore-tool might bogusly fail with "OSD has the store locked" + # see http://tracker.ceph.com/issues/19556 + with safe_while(sleep=15, tries=40, action="ceph-objectstore-tool --op list-pgs") as proceed: + while proceed(): + proc = exp_remote.run(args=cmd, wait=True, + check_status=False, + stdout=StringIO(), stderr=StringIO()) + if proc.exitstatus == 0: + break + elif proc.exitstatus == 1 and "OSD has the store locked" in proc.stderr.getvalue(): + continue + else: + raise Exception("ceph-objectstore-tool: " + "exp list-pgs failure with status {ret}". + format(ret=proc.exitstatus)) + pgs = proc.stdout.getvalue().split('\n')[:-1] if len(pgs) == 0: self.log("No PGs found for osd.{osd}".format(osd=exp_osd))