From b75544be86ba3300d386d1039fa3059afff085e9 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Mon, 1 Jul 2019 11:21:13 -0700
Subject: [PATCH 1/2] qa: use hard_reset to reboot kclient

power_off may allow the mounts to gracefully unmount. We don't want this if the
kclient is stuck or we desire the client to "disappear" and come back.

Fixes: http://tracker.ceph.com/issues/37681
Depends-on: https://github.com/ceph/teuthology/pull/1296
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 qa/tasks/cephfs/kernel_mount.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py
index 6b128f57247..950a1137d1b 100644
--- a/qa/tasks/cephfs/kernel_mount.py
+++ b/qa/tasks/cephfs/kernel_mount.py
@@ -1,6 +1,7 @@
 from StringIO import StringIO
 import json
 import logging
+import time
 from textwrap import dedent
 from teuthology.orchestra.run import CommandFailedError
 from teuthology import misc
@@ -176,21 +177,31 @@ class KernelMount(CephFSMount):
                                                 self.ipmi_user,
                                                 self.ipmi_password,
                                                 self.ipmi_domain)
-        con.power_off()
+        con.hard_reset(wait_for_login=False)
 
         self.mounted = False
 
     def kill_cleanup(self):
         assert not self.mounted
 
-        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
-                                                self.ipmi_user,
-                                                self.ipmi_password,
-                                                self.ipmi_domain)
-        con.power_on()
+        # We need to do a sleep here because we don't know how long it will
+        # take for a hard_reset to be effected.
+        time.sleep(30)
 
-        # Wait for node to come back up after reboot
-        misc.reconnect(None, 300, [self.client_remote])
+        try:
+            # Wait for node to come back up after reboot
+            misc.reconnect(None, 300, [self.client_remote])
+        except:
+            # attempt to get some useful debug output:
+            con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+                                                    self.ipmi_user,
+                                                    self.ipmi_password,
+                                                    self.ipmi_domain)
+            con.check_status(timeout=60)
+            raise
+
+        # Sanity check: run a trivial command on the rebooted node
+        self.client_remote.run(args=['uptime'], timeout=10)
 
         # Remove mount directory
         self.client_remote.run(

From 6b83f43ba0a99ad86850974d354b17c3e23877be Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Thu, 25 Jul 2019 16:01:39 -0700
Subject: [PATCH 2/2] qa: wait for kernel client death

After sending the reboot command, we need to wait briefly for the client
machine to actually go down so that the kernel client doesn't voluntarily
give up its Fb (buffer) cap.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 qa/tasks/cephfs/test_client_recovery.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
index e4c3ce60691..7b305c5c3da 100644
--- a/qa/tasks/cephfs/test_client_recovery.py
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -297,6 +297,9 @@ class TestClientRecovery(CephFSTestCase):
         # Simulate client death
         self.mount_a.kill()
 
+        # wait for it to die so it doesn't voluntarily release buffer cap
+        time.sleep(5)
+
         try:
             # The waiter should get stuck waiting for the capability
             # held on the MDS by the now-dead client A