qa/cephfs: lazy-force unmount clients that have been evicted

Before unmounting, check whether the client has been evicted and, if
so, run "umount -f -l" on the client's mount point and clean up the
mount right after.

Attempting to unmount, clean up or otherwise operate on the mount
point of an evicted client hangs the operation (and thereby our Python
code too). A lazy-force unmount prevents such hangs for our Python
code and also frees the mount point.
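
In outline, the unmount path this commit creates looks like the
following sketch (is_blocked(), _run_umount_lf() and cleanup() are the
helpers added/used in the diff below; "mount" stands for a
FuseMount/KernelMount-like object):

    # Minimal sketch of the new unmount flow.
    def safe_umount(mount, cleanup=True):
        if mount.is_blocked():
            # Client was evicted: a plain umount would hang, so detach
            # lazily and forcibly, then free the mount point.
            mount._run_umount_lf()   # "umount --lazy --force <mntpt>"
            if cleanup:
                mount.cleanup()
            return
        mount.umount()               # regular unmount is safe otherwise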

This commit also adds code to gather session info for kernel mounts
after mounting succeeds. This is necessary because the session's
network address is needed to check whether the client has been
blocklisted by the Ceph cluster.
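
Roughly, the address-to-blocklist matching works as in this standalone
sketch; it assumes the "session ls" JSON format parsed later in this
diff, and the command invocations mirror what the helpers do:

    import json
    import subprocess

    def addr_is_blocklisted(global_id):
        sessions = json.loads(subprocess.check_output(
            ['ceph', 'tell', 'mds.*', 'session', 'ls'], text=True))
        # "inst" looks like "client.<gid> <addr>"; take everything
        # after the first space as the client's network address.
        inst = next(s['inst'] for s in sessions if s['id'] == global_id)
        addr = inst.split(' ', 1)[1]
        blocklist = subprocess.check_output(
            ['ceph', 'osd', 'blocklist', 'ls'], text=True)
        return addr in blocklist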

Fixes: https://tracker.ceph.com/issues/56476
Signed-off-by: Rishabh Dave <ridave@redhat.com>
Author: Rishabh Dave <ridave@redhat.com>
Date:   2022-07-20 15:16:53 +05:30
Parent: b79853fbe9
Commit: c279b47ec9

4 changed files with 65 additions and 13 deletions

qa/tasks/cephfs/fuse_mount.py

@@ -312,6 +312,11 @@ class FuseMount(CephFSMount):
            if cleanup:
                self.cleanup()
            return

        if self.is_blocked():
            self._run_umount_lf()
            if cleanup:
                self.cleanup()
            return

        try:
            log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
@@ -344,15 +349,8 @@ class FuseMount(CephFSMount):
            """).format(self._fuse_conn))
            self._fuse_conn = None

        stderr = StringIO()
        # make sure it's unmounted
        try:
            self.client_remote.run(
                args=['sudo', 'umount', '-l', '-f', self.hostfs_mntpt],
                stderr=stderr, timeout=UMOUNT_TIMEOUT, omit_sudo=False)
        except CommandFailedError:
            if self.is_mounted():
                raise

        self._run_umount_lf()

        self._fuse_conn = None
        self.id = None
@@ -386,6 +384,11 @@ class FuseMount(CephFSMount):
        # mount -o remount (especially if the remount is stuck because MDSs
        # are unavailable)

        if self.is_blocked():
            self._run_umount_lf()
            self.cleanup()
            return

        # cleanup is set to False since cleanup must happen after umount is
        # complete; otherwise the following call to run.wait hangs.
        self.umount(cleanup=False)
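
The deferred cleanup above is the important part; as a sketch of the
resulting ordering (wait_for_daemon() is a hypothetical stand-in for
the "run.wait([self.fuse_daemon], ...)" call the comment refers to):

    # Sketch of umount_wait() ordering for a non-evicted client.
    def umount_wait_sketch(mount):
        mount.umount(cleanup=False)  # cleanup deliberately deferred
        mount.wait_for_daemon()      # hypothetical; hangs if cleanup ran first
        mount.cleanup()              # safe once the daemon has exited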

qa/tasks/cephfs/kernel_mount.py

@@ -62,6 +62,13 @@ class KernelMount(CephFSMount):
            self.enable_dynamic_debug()
        self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count + 1

        self.gather_mount_info()

    def gather_mount_info(self):
        self.id = self._get_global_id()
        self.get_global_inst()
        self.get_global_addr()

    def _run_mount_cmd(self, mntopts, check_status):
        mount_cmd = self._get_mount_cmd(mntopts)
        mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
@@ -133,6 +140,11 @@ class KernelMount(CephFSMount):
            self.cleanup()
            return

        if self.is_blocked():
            self._run_umount_lf()
            self.cleanup()
            return

        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
        try:
@@ -173,11 +185,7 @@ class KernelMount(CephFSMount):
                raise

            # force delete the netns and umount
            log.debug('Force/lazy unmounting on client.{id}...'.format(id=self.client_id))
            self._run_umount_lf()

        self.cleanup()

    def wait_until_mounted(self):
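
With gather_mount_info() wired into mount(), a kernel mount knows its
own session identity as soon as mounting succeeds; a usage sketch,
assuming "mount" is a KernelMount instance:

    # After mount(), the client's global id, "inst" string and network
    # address are cached on the mount object, so is_blocked() can match
    # the address against "ceph osd blocklist ls".
    mount.mount()                  # ends by calling gather_mount_info()
    print(mount.id, mount.inst, mount.addr)
    if mount.is_blocked():         # evicted: a plain umount would hang
        mount._run_umount_lf()
        mount.cleanup()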

qa/tasks/cephfs/mount.py

@@ -168,6 +168,12 @@ class CephFSMount(object):
            get_file(self.client_remote, self.client_keyring_path,
                     sudo=True).decode())

    def is_blocked(self):
        self.fs = Filesystem(self.ctx, name=self.cephfs_name)

        output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls')
        return self.addr in output

    def is_stuck(self):
        """
        Check if mount is stuck/in a hanged state.
@@ -473,6 +479,19 @@ class CephFSMount(object):
        self.mount(**kwargs)
        self.wait_until_mounted()

    def _run_umount_lf(self):
        log.debug(f'Force/lazy unmounting on client.{self.client_id}')

        proc = None
        try:
            proc = self.client_remote.run(
                args=f'sudo umount --lazy --force {self.hostfs_mntpt}',
                timeout=UMOUNT_TIMEOUT, omit_sudo=False)
        except CommandFailedError:
            # tolerate the failure only if the lazy unmount actually
            # detached the mount point
            if self.is_mounted():
                raise

        return proc

    def umount(self):
        raise NotImplementedError()
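
Note that _run_umount_lf() tolerates a failed umount only when nothing
is mounted anymore; spelled out as a standalone sketch (the mount
point and the is_still_mounted() check are hypothetical, "remote" is a
teuthology Remote, and UMOUNT_TIMEOUT's value is assumed):

    from teuthology.exceptions import CommandFailedError

    UMOUNT_TIMEOUT = 300  # value assumed; the qa module defines its own

    def umount_lf(remote, mntpt='/mnt/client.0', is_still_mounted=lambda: False):
        proc = None
        try:
            proc = remote.run(args=f'sudo umount --lazy --force {mntpt}',
                              timeout=UMOUNT_TIMEOUT, omit_sudo=False)
        except CommandFailedError:
            # A failure is acceptable only if the mount point is in
            # fact gone (e.g. the lazy unmount already detached it).
            if is_still_mounted():
                raise
        return proc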

qa/tasks/vstart_runner.py

@@ -55,6 +55,7 @@ vstart_runner.py -
"""
from io import StringIO
from json import loads
from collections import defaultdict
import getpass
import signal
@@ -647,6 +648,12 @@ class LocalCephFSMount():
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))

    def is_blocked(self):
        self.fs = LocalFilesystem(self.ctx, name=self.cephfs_name)

        output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls')
        return self.addr in output


class LocalKernelMount(LocalCephFSMount, KernelMount):
def __init__(self, ctx, test_dir, client_id=None,
@@ -661,6 +668,21 @@ class LocalKernelMount(LocalCephFSMount, KernelMount):
        # Make vstart_runner compatible with teuth and qa/tasks/cephfs.
        self._mount_bin = [os.path.join(BIN_PREFIX, 'mount.ceph')]

    def get_global_addr(self):
        self.get_global_inst()
        self.addr = self.inst[self.inst.find(' ') + 1:]
        return self.addr

    def get_global_inst(self):
        clients = self.client_remote.run(
            args=f'{CEPH_CMD} tell mds.* session ls',
            stdout=StringIO()).stdout.getvalue()
        clients = loads(clients)
        for c in clients:
            if c['id'] == self.id:
                self.inst = c['inst']
        return self.inst


class LocalFuseMount(LocalCephFSMount, FuseMount):
    def __init__(self, ctx, test_dir, client_id, client_keyring_path=None,
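
For reference, the address extraction in get_global_addr() is just a
split on the first space of the session's "inst" string; a worked
example with a made-up value:

    # The sample "inst" string is hypothetical but follows the
    # "client.<gid> <addr>" shape that "session ls" reports.
    inst = 'client.4305 v1:127.0.0.1:0/1598551522'
    addr = inst[inst.find(' ') + 1:]
    assert addr == 'v1:127.0.0.1:0/1598551522'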