From 07eb03acee79067bd8decd092d3279b2a5c42398 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jun 2015 10:16:55 +0100 Subject: [PATCH 1/2] tasks/cephfs: time out on ceph-fuses that don't die For cases where we have e.g. poked the fuse abort file for a process, but it's still not dying. Because this is a special class of error (unlike e.g. when we force umount something because the network is gone) raise the error instead of trying again to kill the client. Fixes: #11835 Signed-off-by: John Spray --- tasks/cephfs/fuse_mount.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tasks/cephfs/fuse_mount.py b/tasks/cephfs/fuse_mount.py index 5517f5f8474..88531fdbb18 100644 --- a/tasks/cephfs/fuse_mount.py +++ b/tasks/cephfs/fuse_mount.py @@ -7,6 +7,7 @@ import logging from textwrap import dedent from teuthology import misc +from teuthology.contextutil import MaxWhileTries from teuthology.orchestra import run from teuthology.orchestra.run import CommandFailedError from .mount import CephFSMount @@ -234,7 +235,12 @@ class FuseMount(CephFSMount): try: if self.fuse_daemon: - self.fuse_daemon.wait() + # Permit a timeout, so that we do not block forever + run.wait([self.fuse_daemon], 30) + except MaxWhileTries: + log.error("process failed to terminate after unmount. This probably" + "indicates a bug within ceph-fuse.") + raise except CommandFailedError: pass From f97fde6c2695145e38a8bc3c44cbfeb69d2465d1 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Wed, 16 Sep 2015 14:27:14 -0700 Subject: [PATCH 2/2] tasks/cephfs: switch unmount timeout to 15 minutes Our ffsb and fsync tests contain so many small writes at random offsets that it can take >10 minutes to commit all of them to disk if we get a slower OSD cluster. 15 minutes is still a plenty-fast timeout for this stage compared to just hanging and losing the logs! Signed-off-by: Greg Farnum --- tasks/cephfs/fuse_mount.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/cephfs/fuse_mount.py b/tasks/cephfs/fuse_mount.py index 88531fdbb18..60722a2e9f2 100644 --- a/tasks/cephfs/fuse_mount.py +++ b/tasks/cephfs/fuse_mount.py @@ -236,7 +236,7 @@ class FuseMount(CephFSMount): try: if self.fuse_daemon: # Permit a timeout, so that we do not block forever - run.wait([self.fuse_daemon], 30) + run.wait([self.fuse_daemon], 900) except MaxWhileTries: log.error("process failed to terminate after unmount. This probably" "indicates a bug within ceph-fuse.")