Merge PR #38693 into master

* refs/pull/38693/head:
	qa: execute scrubs only on rank 0
	client: print debug information about resolved MDS
	qa: let Client::resolve_mds lookup the rank

Reviewed-by: Xiubo Li <xiubli@redhat.com>
This commit is contained in:
Patrick Donnelly 2021-01-08 10:47:07 -08:00
commit ea61b6a10f
No known key found for this signature in database
GPG Key ID: 3A2A7E25BEA8AADB
3 changed files with 65 additions and 117 deletions

View File

@@ -1080,8 +1080,7 @@ class Filesystem(MDSCluster):
return self.json_asok(command, 'mds', info['name'], timeout=timeout)
def rank_tell(self, command, rank=0, status=None):
info = self.get_rank(rank=rank, status=status)
return json.loads(self.mon_manager.raw_cluster_cmd("tell", 'mds.{0}'.format(info['name']), *command))
return json.loads(self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command))
def ranks_tell(self, command, status=None):
if status is None:

View File

@@ -3,9 +3,8 @@ Thrash mds by simulating failures
"""
import logging
import contextlib
import threading
from gevent import sleep, killall, joinall, GreenletExit
from gevent import sleep, GreenletExit
from gevent.greenlet import Greenlet
from gevent.event import Event
from teuthology import misc as teuthology
@@ -18,14 +17,23 @@ from tasks.thrasher import Thrasher
log = logging.getLogger(__name__)
class ForwardScrubber(Thrasher, Greenlet):
"""
ForwardScrubber::
The ForwardScrubber does forward scrubbing of file-systems during execution
of other tasks (workunits, etc).
"""
def __init__(self, fs, scrub_timeout=300, sleep_between_iterations=1):
super(ForwardScrubber, self).__init__()
class MDSRankScrubber(Thrasher, Greenlet):
def __init__(self, fs, mds_rank, scrub_timeout=300):
super(MDSRankScrubber, self).__init__()
self.logger = log.getChild('fs.[{f}]'.format(f=fs.name))
self.fs = fs
self.mds_rank = mds_rank
self.name = 'thrasher.fs.[{f}]'.format(f=fs.name)
self.stopping = Event()
self.scrub_timeout = scrub_timeout
self.sleep_between_iterations = sleep_between_iterations
def _run(self):
try:
@@ -35,10 +43,28 @@ class MDSRankScrubber(Thrasher, Greenlet):
self.logger.exception("exception:")
# allow successful completion so gevent doesn't see an exception...
def do_scrub(self, path="/", recursive=True):
def stop(self):
self.stopping.set()
def do_scrub(self):
"""
Perform the file-system scrubbing
"""
self.logger.info(f'start scrubbing fs: {self.fs.name}')
try:
while not self.stopping.is_set():
self._scrub()
sleep(self.sleep_between_iterations)
except GreenletExit:
pass
self.logger.info(f'end scrubbing fs: {self.fs.name}')
def _scrub(self, path="/", recursive=True):
self.logger.info(f"scrubbing fs: {self.fs.name}")
recopt = ["recursive", "force"] if recursive else ["force"]
out_json = self.fs.rank_tell(["scrub", "start", path] + recopt,
rank=self.mds_rank)
out_json = self.fs.rank_tell(["scrub", "start", path] + recopt)
assert out_json is not None
tag = out_json['scrub_tag']
@@ -47,15 +73,14 @@ class MDSRankScrubber(Thrasher, Greenlet):
assert out_json['return_code'] == 0
assert out_json['mode'] == 'asynchronous'
self.wait_until_scrub_complete(tag)
return self._wait_until_scrub_complete(tag)
def wait_until_scrub_complete(self, tag):
def _wait_until_scrub_complete(self, tag):
# time out after scrub_timeout seconds and assume as done
with contextutil.safe_while(sleep=30, tries=self.scrub_timeout//30) as proceed:
while proceed():
try:
out_json = self.fs.rank_tell(["scrub", "status"],
rank=self.mds_rank)
out_json = self.fs.rank_tell(["scrub", "status"])
assert out_json is not None
if out_json['status'] == "no active scrubs running":
self.logger.info("all active scrubs completed")
@@ -72,96 +97,14 @@ class MDSRankScrubber(Thrasher, Greenlet):
self.logger.info("retrying scrub status command in a while")
pass
self.logger.info("timed out waiting for scrub to complete")
class ForwardScrubber(Thrasher, Greenlet):
"""
ForwardScrubber::
The ForwardScrubber does forward scrubbing of file-systems during execution
of other tasks (workunits, etc).
"""
def __init__(self, fs, scrub_timeout=300, sleep_between_iterations=1):
super(ForwardScrubber, self).__init__()
self.logger = log.getChild('fs.[{f}]'.format(f=fs.name))
self.fs = fs
self.name = 'thrasher.fs.[{f}]'.format(f=fs.name)
self.stopping = Event()
self.lock = threading.Lock()
self.scrubbers = []
self.scrub_timeout = scrub_timeout
self.sleep_between_iterations = sleep_between_iterations
def _run(self):
try:
self.do_scrub()
except Exception as e:
self.set_thrasher_exception(e)
self.logger.exception("exception:")
# allow successful completion so gevent doesn't see an exception...
def log(self, x):
"""Write data to the logger assigned to ForwardScrubber"""
self.logger.info(x)
def stop(self):
self.stopping.set()
self.lock.acquire()
try:
self.log("killing all scrubbers")
killall(self.scrubbers)
finally:
self.lock.release()
def do_scrub(self):
"""
Perform the file-system scrubbing
"""
self.log(f'starting do_scrub for fs: {self.fs.name}')
try:
while not self.stopping.is_set():
ranks = self.fs.get_all_mds_rank()
for r in ranks:
scrubber = MDSRankScrubber(self.fs, r, self.scrub_timeout)
self.lock.acquire()
try:
self.scrubbers.append(scrubber)
finally:
self.lock.release()
scrubber.start()
# wait for all scrubbers to complete
self.log("joining all scrubbers")
joinall(self.scrubbers)
for s in self.scrubbers:
if s.exception is not None:
raise RuntimeError('error during scrub thrashing')
self.lock.acquire()
try:
self.scrubbers.clear()
finally:
self.lock.release()
sleep(self.sleep_between_iterations)
except GreenletExit:
pass
def stop_all_fwd_scrubbers(thrashers):
for thrasher in thrashers:
if not isinstance(thrasher, ForwardScrubber):
continue
thrasher.stop()
thrasher.join()
if thrasher.exception is not None:
raise RuntimeError(f"error during scrub thrashing: {thrasher.exception}")
thrasher.join()
@contextlib.contextmanager

View File

@@ -5794,14 +5794,14 @@ int Client::resolve_mds(
ceph_assert(targets != nullptr);
mds_role_t role;
std::stringstream ss;
int role_r = fsmap->parse_role(mds_spec, &role, ss);
CachedStackStringStream css;
int role_r = fsmap->parse_role(mds_spec, &role, *css);
if (role_r == 0) {
// We got a role, resolve it to a GID
ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
<< role << "'" << dendl;
targets->push_back(
fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
<< role << "' aka " << info.human_name() << dendl;
targets->push_back(info.global_id);
return 0;
}
@@ -5811,42 +5811,48 @@ int Client::resolve_mds(
// It is a possible GID
const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
if (fsmap->gid_exists(mds_gid)) {
ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
auto& info = fsmap->get_info_gid(mds_gid);
ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
<< info.human_name() << dendl;
targets->push_back(mds_gid);
return 0;
} else {
lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
<< dendl;
lderr(cct) << "FSMap: " << *fsmap << dendl;
return -ENOENT;
}
} else if (mds_spec == "*") {
// It is a wildcard: use all MDSs
const auto mds_info = fsmap->get_mds_info();
const auto& mds_info = fsmap->get_mds_info();
ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
if (mds_info.empty()) {
lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
lderr(cct) << "FSMap: " << *fsmap << dendl;
return -ENOENT;
}
for (const auto& i : mds_info) {
targets->push_back(i.first);
for (const auto& [gid, info] : mds_info) {
ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
targets->push_back(gid);
}
return 0;
} else {
// It did not parse as an integer, it is not a wildcard, it must be a name
const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
if (mds_gid == 0) {
lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
lderr(cct) << "FSMap: " << *fsmap << dendl;
return -ENOENT;
} else {
ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
<< "' to GID " << mds_gid << dendl;
auto& info = fsmap->get_info_gid(mds_gid);
ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
<< "' to " << info.human_name() << dendl;
targets->push_back(mds_gid);
}
return 0;
}
return 0;
}