From 33899fdaac57db5266940f59f5ef956932aa1714 Mon Sep 17 00:00:00 2001
From: Milind Changire
Date: Sat, 11 Nov 2023 09:09:55 +0530
Subject: [PATCH] qa: bump up scrub status command timeout

A journal flush sometimes takes more than 120 seconds, so a 'scrub
status' command that blocks for more than 120 seconds is declared
failed, which in turn causes the entire job to be marked as failed.
Bumping up the timeout gives the 'scrub status' command more time to
wait and lets the journal flush run to completion.

Fixes: https://tracker.ceph.com/issues/63411
Signed-off-by: Milind Changire
---
 qa/tasks/ceph_manager.py      | 15 +++++++++------
 qa/tasks/cephfs/filesystem.py | 12 ++++++------
 qa/tasks/vstart_runner.py     |  4 +++-
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 86b57028ee2..fa909c1a77c 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -1540,11 +1540,9 @@ class CephManager:
         self.cephadm = cephadm
         self.testdir = teuthology.get_testdir(self.ctx)
         # prefix args for ceph cmds to be executed
-        pre = ['adjust-ulimits', 'ceph-coverage',
-               f'{self.testdir}/archive/coverage']
-        self.CEPH_CMD = ['sudo'] + pre + ['timeout', '120', 'ceph',
-                                          '--cluster', self.cluster]
-        self.RADOS_CMD = pre + ['rados', '--cluster', self.cluster]
+        self.pre = ['adjust-ulimits', 'ceph-coverage',
+                    f'{self.testdir}/archive/coverage']
+        self.RADOS_CMD = self.pre + ['rados', '--cluster', self.cluster]

         pools = self.list_pools()
         self.pools = {}
@@ -1555,6 +1553,11 @@ class CephManager:
             except CommandFailedError:
                 self.log('Failed to get pg_num from pool %s, ignoring' % pool)

+    def get_ceph_cmd(self, **kwargs):
+        timeout = kwargs.pop('timeout', 120)
+        return ['sudo'] + self.pre + ['timeout', f'{timeout}', 'ceph',
+                                      '--cluster', self.cluster]
+
     def ceph(self, cmd, **kwargs):
         """
         Simple Ceph admin command wrapper around run_cluster_cmd.
@@ -1598,7 +1601,7 @@ class CephManager:
                                        stdout=StringIO(),
                                        check_status=kwargs.get('check_status', True))
         else:
-            kwargs['args'] = prefixcmd + self.CEPH_CMD + kwargs['args']
+            kwargs['args'] = prefixcmd + self.get_ceph_cmd(**kwargs) + kwargs['args']
             return self.controller.run(**kwargs)

     def raw_cluster_cmd(self, *args, **kwargs) -> str:
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 3516bf4b86c..7de5653021b 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -1295,9 +1295,9 @@ class Filesystem(MDSCluster):
         info = self.get_rank(rank=rank, status=status)
         return self.json_asok(command, 'mds', info['name'], timeout=timeout)

-    def rank_tell(self, command, rank=0, status=None):
+    def rank_tell(self, command, rank=0, status=None, timeout=120):
         try:
-            out = self.get_ceph_cmd_stdout("tell", f"mds.{self.id}:{rank}", *command)
+            out = self.get_ceph_cmd_stdout("tell", f"mds.{self.id}:{rank}", *command, timeout=timeout)
             return json.loads(out)
         except json.decoder.JSONDecodeError:
             log.error("could not decode: {}".format(out))
@@ -1712,11 +1712,11 @@ class Filesystem(MDSCluster):
             self.set_max_mds(new_max_mds)
             return self.wait_for_daemons()

-    def run_scrub(self, cmd, rank=0):
-        return self.rank_tell(["scrub"] + cmd, rank)
+    def run_scrub(self, cmd, rank=0, timeout=300):
+        return self.rank_tell(["scrub"] + cmd, rank=rank, timeout=timeout)

     def get_scrub_status(self, rank=0):
-        return self.run_scrub(["status"], rank)
+        return self.run_scrub(["status"], rank=rank, timeout=300)

     def flush(self, rank=0):
         return self.rank_tell(["flush", "journal"], rank=rank)
@@ -1728,7 +1728,7 @@ class Filesystem(MDSCluster):
             result = "no active scrubs running"
         with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
             while proceed():
-                out_json = self.rank_tell(["scrub", "status"], rank=rank)
+                out_json = self.rank_tell(["scrub", "status"], rank=rank, timeout=timeout)
                 assert out_json is not None
                 if not reverse:
                     if result in out_json['status']:
diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py
index 96dc9fffba2..8d9dc6f1845 100644
--- a/qa/tasks/vstart_runner.py
+++ b/qa/tasks/vstart_runner.py
@@ -804,9 +804,11 @@ class LocalCephManager(CephManager):
         self.cephadm = False
         self.rook = False
         self.testdir = None
-        self.CEPH_CMD = [CEPH_CMD]
         self.RADOS_CMD = [RADOS_CMD]

+    def get_ceph_cmd(self, **kwargs):
+        return [CEPH_CMD]
+
     def find_remote(self, daemon_type, daemon_id):
         """
         daemon_type like 'mds', 'osd'
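
For reference, the core of the change can be exercised in isolation.
The sketch below is not code from the tree; ManagerSketch and the
hard-coded coverage path are stand-ins invented for the demo. It models
how the new get_ceph_cmd() pops a per-call 'timeout' kwarg (defaulting
to the old 120 seconds) and bakes it into the timeout(1) prefix of the
argv:

    #!/usr/bin/env python3
    # Minimal model of the get_ceph_cmd() pattern from the patch above.
    class ManagerSketch:
        def __init__(self, cluster='ceph'):
            self.cluster = cluster
            # stand-in for the adjust-ulimits/ceph-coverage prefix
            self.pre = ['adjust-ulimits', 'ceph-coverage',
                        '/tmp/archive/coverage']

        def get_ceph_cmd(self, **kwargs):
            # 'timeout' is popped off so the remaining kwargs describe
            # the run itself; 120s stays the default for all callers.
            timeout = kwargs.pop('timeout', 120)
            return ['sudo'] + self.pre + ['timeout', f'{timeout}', 'ceph',
                                          '--cluster', self.cluster]

    mgr = ManagerSketch()
    assert '120' in mgr.get_ceph_cmd()
    assert '300' in mgr.get_ceph_cmd(timeout=300)

Turning the fixed CEPH_CMD attribute into a method is what lets the
single call site in the CephManager run path honor a per-command
timeout; LocalCephManager in vstart_runner.py overrides the method to
return a bare [CEPH_CMD], presumably because a local vstart cluster
needs neither sudo nor the coverage and timeout wrappers.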
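
A second sketch traces how the timeout travels down the scrub path.
Function names mirror the patch, but the bodies are stand-ins that
never contact a cluster, so this only demonstrates the keyword
propagation and the new 300-second ceiling for 'scrub status':

    import json

    def get_ceph_cmd_stdout(*args, timeout=120):
        # The real helper shells out via 'timeout <N> ceph tell ...';
        # here we just report which timeout would have been applied.
        return json.dumps({"status": "no active scrubs running",
                           "applied_timeout": timeout})

    def rank_tell(command, rank=0, timeout=120):
        return json.loads(get_ceph_cmd_stdout(
            "tell", f"mds.cephfs:{rank}", *command, timeout=timeout))

    def run_scrub(cmd, rank=0, timeout=300):
        return rank_tell(["scrub"] + cmd, rank=rank, timeout=timeout)

    def get_scrub_status(rank=0):
        # 'scrub status' now waits up to 300s instead of the old 120s cap
        return run_scrub(["status"], rank=rank, timeout=300)

    assert get_scrub_status()["applied_timeout"] == 300

The 300-second figure comes straight from the patch; the filesystem
name and the JSON payload are made up for the demo.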