ceph_manager: do_pg_scrub(): don't resubmit a scrub request for 2 minutes

Add a 6-minute fatal timeout.
Warn when a repair is resubmitted, because resubmitting a repair causes races.

Signed-off-by: David Zafman <dzafman@redhat.com>
This commit is contained in:
David Zafman 2016-09-14 15:43:02 -07:00
parent 7625ebd8a2
commit 60cdb05380

View File

@@ -1561,10 +1561,20 @@ class CephManager:
Scrub pg and wait for scrubbing to finish
"""
init = self.get_last_scrub_stamp(pool, pgnum)
RESEND_TIMEOUT = 120 # Must be a multiple of SLEEP_TIME
FATAL_TIMEOUT = RESEND_TIMEOUT * 3
SLEEP_TIME = 10
timer = 0
while init == self.get_last_scrub_stamp(pool, pgnum):
assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
self.log("waiting for scrub type %s" % (stype,))
self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
time.sleep(10)
if (timer % RESEND_TIMEOUT) == 0:
self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
# The first time in this loop is the actual request
if timer != 0 and stype == "repair":
self.log("WARNING: Resubmitted a non-idempotent repair")
time.sleep(SLEEP_TIME)
timer += SLEEP_TIME
def get_single_pg_stats(self, pgid):
"""