ceph_manager: in do_pg_scrub(), don't resubmit a request for 2 minutes

Add a 6-minute fatal timeout.
Warn when a repair is being resubmitted, because resubmitting it causes races.

Signed-off-by: David Zafman <dzafman@redhat.com>
2025-03-11 02:39:05 +00:00 · 2016-09-14 15:43:02 -07:00 · 2016-09-14 15:43:02 -07:00 · 60cdb05380
commit 60cdb05380
parent 7625ebd8a2
1 changed files with 12 additions and 2 deletions
--- a/tasks/ceph_manager.py
+++ b/tasks/ceph_manager.py
@@ -1561,10 +1561,20 @@ class CephManager:
        Scrub pg and wait for scrubbing to finish
        """
        init = self.get_last_scrub_stamp(pool, pgnum)
+        RESEND_TIMEOUT = 120    # Must be a multiple of SLEEP_TIME
+        FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+        SLEEP_TIME = 10
+        timer = 0
        while init == self.get_last_scrub_stamp(pool, pgnum):
+            assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
            self.log("waiting for scrub type %s" % (stype,))
-            self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
-            time.sleep(10)
+            if (timer % RESEND_TIMEOUT) == 0:
+                self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+                # The first time in this loop is the actual request
+                if timer != 0 and stype == "repair":
+                    self.log("WARNING: Resubmitted a non-idempotent repair")
+            time.sleep(SLEEP_TIME)
+            timer += SLEEP_TIME

    def get_single_pg_stats(self, pgid):
        """