From b75544be86ba3300d386d1039fa3059afff085e9 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 1 Jul 2019 11:21:13 -0700 Subject: [PATCH 1/2] qa: use hard_reset to reboot kclient power_off may allow the mounts to gracefully unmount. We don't want this if the kclient is stuck or we desire the client to "disappear" and come back. Fixes: http://tracker.ceph.com/issues/37681 Depends-on: https://github.com/ceph/teuthology/pull/1296 Signed-off-by: Patrick Donnelly --- qa/tasks/cephfs/kernel_mount.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py index 6b128f57247..950a1137d1b 100644 --- a/qa/tasks/cephfs/kernel_mount.py +++ b/qa/tasks/cephfs/kernel_mount.py @@ -1,6 +1,7 @@ from StringIO import StringIO import json import logging +import time from textwrap import dedent from teuthology.orchestra.run import CommandFailedError from teuthology import misc @@ -176,21 +177,31 @@ class KernelMount(CephFSMount): self.ipmi_user, self.ipmi_password, self.ipmi_domain) - con.power_off() + con.hard_reset(wait_for_login=False) self.mounted = False def kill_cleanup(self): assert not self.mounted - con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, - self.ipmi_user, - self.ipmi_password, - self.ipmi_domain) - con.power_on() + # We need to do a sleep here because we don't know how long it will + # take for a hard_reset to be effected. 
+ time.sleep(30) - # Wait for node to come back up after reboot - misc.reconnect(None, 300, [self.client_remote]) + try: + # Wait for node to come back up after reboot + misc.reconnect(None, 300, [self.client_remote]) + except: + # attempt to get some useful debug output: + con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, + self.ipmi_user, + self.ipmi_password, + self.ipmi_domain) + con.check_status(timeout=60) + raise + + # Run a trivial command to confirm the node responds + self.client_remote.run(args=['uptime'], timeout=10) # Remove mount directory self.client_remote.run( From 6b83f43ba0a99ad86850974d354b17c3e23877be Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 25 Jul 2019 16:01:39 -0700 Subject: [PATCH 2/2] qa: wait for kernel client death After sending the reboot command, we need to wait briefly for it to be rebooted so that the kernel client doesn't voluntarily give up its Fb cap. Signed-off-by: Patrick Donnelly --- qa/tasks/cephfs/test_client_recovery.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py index e4c3ce60691..7b305c5c3da 100644 --- a/qa/tasks/cephfs/test_client_recovery.py +++ b/qa/tasks/cephfs/test_client_recovery.py @@ -297,6 +297,9 @@ class TestClientRecovery(CephFSTestCase): # Simulate client death self.mount_a.kill() + # wait for it to die so it doesn't voluntarily release buffer cap + time.sleep(5) + try: # The waiter should get stuck waiting for the capability # held on the MDS by the now-dead client A