diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py
index 6b128f57247..950a1137d1b 100644
--- a/qa/tasks/cephfs/kernel_mount.py
+++ b/qa/tasks/cephfs/kernel_mount.py
@@ -1,6 +1,7 @@
 from StringIO import StringIO
 import json
 import logging
+import time
 from textwrap import dedent
 from teuthology.orchestra.run import CommandFailedError
 from teuthology import misc
@@ -176,21 +177,31 @@ class KernelMount(CephFSMount):
                                                 self.ipmi_user,
                                                 self.ipmi_password,
                                                 self.ipmi_domain)
-        con.power_off()
+        con.hard_reset(wait_for_login=False)

         self.mounted = False

     def kill_cleanup(self):
         assert not self.mounted

-        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
-                                                self.ipmi_user,
-                                                self.ipmi_password,
-                                                self.ipmi_domain)
-        con.power_on()
+        # We need to do a sleep here because we don't know how long it will
+        # take for a hard_reset to be effected.
+        time.sleep(30)

-        # Wait for node to come back up after reboot
-        misc.reconnect(None, 300, [self.client_remote])
+        try:
+            # Wait for node to come back up after reboot
+            misc.reconnect(None, 300, [self.client_remote])
+        except:
+            # attempt to get some useful debug output:
+            con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+                                                    self.ipmi_user,
+                                                    self.ipmi_password,
+                                                    self.ipmi_domain)
+            con.check_status(timeout=60)
+            raise
+
+        # Sanity check that the node is responsive before proceeding
+        self.client_remote.run(args=['uptime'], timeout=10)

         # Remove mount directory
         self.client_remote.run(
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
index e4c3ce60691..7b305c5c3da 100644
--- a/qa/tasks/cephfs/test_client_recovery.py
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -297,6 +297,9 @@ class TestClientRecovery(CephFSTestCase):
         # Simulate client death
         self.mount_a.kill()

+        # Wait for the client to die so that it doesn't voluntarily
+        # release its buffer cap
+        time.sleep(5)
+
         try:
             # The waiter should get stuck waiting for the capability
             # held on the MDS by the now-dead client A
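
Note (reviewer aside, not part of the patch): the fixed time.sleep(30) exists
because there is no notification for when the IPMI hard_reset actually takes
effect. A minimal sketch of a poll-based alternative, assuming a
caller-supplied liveness probe; the remote_is_up callable below is a
hypothetical stand-in, not a teuthology API:

    import time

    def wait_for_reboot(remote_is_up, down_timeout=60, up_timeout=300,
                        interval=5):
        """Wait for a host to drop off the network after a hard reset,
        then wait for it to answer again.

        remote_is_up is a callable returning True while the host responds
        (e.g. an SSH probe). Raises RuntimeError if either phase times out.
        """
        # Phase 1: the reset has taken effect once the host stops answering.
        deadline = time.time() + down_timeout
        while remote_is_up():
            if time.time() > deadline:
                raise RuntimeError("host never went down after hard reset")
            time.sleep(interval)

        # Phase 2: wait for the host to boot and start answering again.
        deadline = time.time() + up_timeout
        while not remote_is_up():
            if time.time() > deadline:
                raise RuntimeError("host did not come back after hard reset")
            time.sleep(interval)

This trades the guessed 30-second constant for a cheap liveness check. The
patch's approach of sleeping once and then letting misc.reconnect() retry for
up to 300 seconds is simpler and adequate for the test harness.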