mirror of
https://github.com/ceph/ceph
synced 2025-02-23 02:57:21 +00:00
qa/cephfs: lazy-force unmount clients that have been evicted
Before unmounting, check if the client has been evicted and, if so, run "umount -f -l" on the client's mount point and clean up the mount right after it. Attempting to unmount, clean up or otherwise operate on the mount point of an evicted client will hang the operation (and thereby our Python code too). A lazy-force unmount prevents such hangs in our Python code and also frees the mount point. This commit also adds code to gather session info for kernel mounts after mounting succeeds. This is necessary since the network address of the session is needed to check whether it is blocked by the Ceph cluster. Fixes: https://tracker.ceph.com/issues/56476 Signed-off-by: Rishabh Dave <ridave@redhat.com>
This commit is contained in:
parent
b79853fbe9
commit
c279b47ec9
@ -312,6 +312,11 @@ class FuseMount(CephFSMount):
|
||||
if cleanup:
|
||||
self.cleanup()
|
||||
return
|
||||
if self.is_blocked():
|
||||
self._run_umount_lf()
|
||||
if cleanup:
|
||||
self.cleanup()
|
||||
return
|
||||
|
||||
try:
|
||||
log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
|
||||
@ -344,15 +349,8 @@ class FuseMount(CephFSMount):
|
||||
""").format(self._fuse_conn))
|
||||
self._fuse_conn = None
|
||||
|
||||
stderr = StringIO()
|
||||
# make sure its unmounted
|
||||
try:
|
||||
self.client_remote.run(
|
||||
args=['sudo', 'umount', '-l', '-f', self.hostfs_mntpt],
|
||||
stderr=stderr, timeout=UMOUNT_TIMEOUT, omit_sudo=False)
|
||||
except CommandFailedError:
|
||||
if self.is_mounted():
|
||||
raise
|
||||
self._run_umount_lf()
|
||||
|
||||
self._fuse_conn = None
|
||||
self.id = None
|
||||
@ -386,6 +384,11 @@ class FuseMount(CephFSMount):
|
||||
# mount -o remount (especially if the remount is stuck because MDSs
|
||||
# are unavailable)
|
||||
|
||||
if self.is_blocked():
|
||||
self._run_umount_lf()
|
||||
self.cleanup()
|
||||
return
|
||||
|
||||
# cleanup is set to False since cleanup must happen after umount is
|
||||
# complete; otherwise following call to run.wait hangs.
|
||||
self.umount(cleanup=False)
|
||||
|
@ -62,6 +62,13 @@ class KernelMount(CephFSMount):
|
||||
self.enable_dynamic_debug()
|
||||
self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count + 1
|
||||
|
||||
self.gather_mount_info()
|
||||
|
||||
def gather_mount_info(self):
    """
    Collect session details for this mount and keep them on the instance.

    Stores the value returned by ``_get_global_id()`` in ``self.id`` and
    then calls ``get_global_inst()`` followed by ``get_global_addr()``.
    """
    global_id = self._get_global_id()
    # set the id first; the session lookups below may compare against it
    self.id = global_id

    self.get_global_inst()
    self.get_global_addr()
|
||||
|
||||
def _run_mount_cmd(self, mntopts, check_status):
|
||||
mount_cmd = self._get_mount_cmd(mntopts)
|
||||
mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
|
||||
@ -133,6 +140,11 @@ class KernelMount(CephFSMount):
|
||||
self.cleanup()
|
||||
return
|
||||
|
||||
if self.is_blocked():
|
||||
self._run_umount_lf()
|
||||
self.cleanup()
|
||||
return
|
||||
|
||||
log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
|
||||
|
||||
try:
|
||||
@ -173,11 +185,7 @@ class KernelMount(CephFSMount):
|
||||
raise
|
||||
|
||||
# force delete the netns and umount
|
||||
log.debug('Force/lazy unmounting on client.{id}...'.format(id=self.client_id))
|
||||
self.client_remote.run(args=['sudo', 'umount', '-f', '-l',
|
||||
self.mountpoint], timeout=timeout,
|
||||
omit_sudo=False)
|
||||
|
||||
self._run_umount_lf()
|
||||
self.cleanup()
|
||||
|
||||
def wait_until_mounted(self):
|
||||
|
@ -168,6 +168,12 @@ class CephFSMount(object):
|
||||
get_file(self.client_remote, self.client_keyring_path,
|
||||
sudo=True).decode())
|
||||
|
||||
def is_blocked(self):
    """
    Report whether this client's address shows up in the OSD blocklist.

    Rebinds ``self.fs`` to a new Filesystem object for
    ``self.cephfs_name``, runs "osd blocklist ls" through its mon manager
    and returns the result of testing ``self.addr`` for membership in the
    command output.
    """
    self.fs = Filesystem(self.ctx, name=self.cephfs_name)
    blocklist = self.fs.mon_manager.raw_cluster_cmd(
        args='osd blocklist ls')

    return self.addr in blocklist
|
||||
|
||||
def is_stuck(self):
|
||||
"""
|
||||
Check if mount is stuck/in a hanged state.
|
||||
@ -473,6 +479,19 @@ class CephFSMount(object):
|
||||
self.mount(**kwargs)
|
||||
self.wait_until_mounted()
|
||||
|
||||
def _run_umount_lf(self):
    """
    Lazily and forcefully unmount this client's mount point.

    Runs "sudo umount --lazy --force" on ``self.hostfs_mntpt`` through
    ``self.client_remote`` with a timeout of UMOUNT_TIMEOUT.

    Returns whatever ``client_remote.run()`` returned, or None when the
    command failed but ``is_mounted()`` reports that the mount point is
    no longer mounted.

    Raises CommandFailedError when the command failed and the mount point
    is still mounted.
    """
    log.debug(f'Force/lazy unmounting on client.{self.client_id}')

    # Bind the result up front: if run() raises and the failure is
    # tolerated below, the final return would otherwise hit an unbound
    # local and raise UnboundLocalError instead of returning quietly.
    proc = None
    try:
        proc = self.client_remote.run(
            args=f'sudo umount --lazy --force {self.hostfs_mntpt}',
            timeout=UMOUNT_TIMEOUT, omit_sudo=False)
    except CommandFailedError:
        # a failed umount is acceptable only if nothing is mounted anymore
        if self.is_mounted():
            raise

    return proc
|
||||
|
||||
def umount(self):
    """
    Placeholder for unmounting this client.

    This implementation always raises NotImplementedError; concrete
    mount classes are expected to supply their own version.
    """
    raise NotImplementedError()
|
||||
|
||||
|
@ -55,6 +55,7 @@ vstart_runner.py -
|
||||
"""
|
||||
|
||||
from io import StringIO
|
||||
from json import loads
|
||||
from collections import defaultdict
|
||||
import getpass
|
||||
import signal
|
||||
@ -647,6 +648,12 @@ class LocalCephFSMount():
|
||||
self.fs.wait_for_daemons()
|
||||
log.info('Ready to start {}...'.format(type(self).__name__))
|
||||
|
||||
def is_blocked(self):
    """
    Report whether this client's address shows up in the OSD blocklist.

    Rebinds ``self.fs`` to a new LocalFilesystem object for
    ``self.cephfs_name``, runs "osd blocklist ls" through its mon manager
    and returns the result of testing ``self.addr`` for membership in the
    command output.
    """
    self.fs = LocalFilesystem(self.ctx, name=self.cephfs_name)
    blocklist = self.fs.mon_manager.raw_cluster_cmd(
        args='osd blocklist ls')

    return self.addr in blocklist
|
||||
|
||||
|
||||
class LocalKernelMount(LocalCephFSMount, KernelMount):
|
||||
def __init__(self, ctx, test_dir, client_id=None,
|
||||
@ -661,6 +668,21 @@ class LocalKernelMount(LocalCephFSMount, KernelMount):
|
||||
# Make vstart_runner compatible with teuth and qa/tasks/cephfs.
|
||||
self._mount_bin = [os.path.join(BIN_PREFIX , 'mount.ceph')]
|
||||
|
||||
def get_global_addr(self):
    """
    Derive the client's address from its session "inst" string.

    Calls ``get_global_inst()`` first, then stores in ``self.addr`` the
    part of ``self.inst`` that follows the first space (the whole string
    when it contains no space) and returns it.
    """
    self.get_global_inst()

    # keep only what comes after the first space of the inst string
    self.addr = self.inst.split(' ', 1)[-1]
    return self.addr
|
||||
|
||||
def get_global_inst(self):
    """
    Look up this client's "inst" entry in the MDS session list.

    Runs "tell mds.* session ls" through ``self.client_remote``, parses
    the captured stdout as JSON and, for every listed session whose
    ``id`` equals ``self.id``, stores that session's ``inst`` value in
    ``self.inst``. Returns ``self.inst``.
    """
    proc = self.client_remote.run(
        args=f'{CEPH_CMD} tell mds.* session ls', stdout=StringIO())
    sessions = loads(proc.stdout.getvalue())

    for session in sessions:
        if session['id'] != self.id:
            continue
        self.inst = session['inst']

    return self.inst
|
||||
|
||||
|
||||
class LocalFuseMount(LocalCephFSMount, FuseMount):
|
||||
def __init__(self, ctx, test_dir, client_id, client_keyring_path=None,
|
||||
|
Loading…
Reference in New Issue
Block a user