diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 8df09412576..71f7ed3af6e 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -1080,8 +1080,7 @@ class Filesystem(MDSCluster): return self.json_asok(command, 'mds', info['name'], timeout=timeout) def rank_tell(self, command, rank=0, status=None): - info = self.get_rank(rank=rank, status=status) - return json.loads(self.mon_manager.raw_cluster_cmd("tell", 'mds.{0}'.format(info['name']), *command)) + return json.loads(self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command)) def ranks_tell(self, command, status=None): if status is None: diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py index 4bd2529feba..15393c20992 100644 --- a/qa/tasks/fwd_scrub.py +++ b/qa/tasks/fwd_scrub.py @@ -3,9 +3,8 @@ Thrash mds by simulating failures """ import logging import contextlib -import threading -from gevent import sleep, killall, joinall, GreenletExit +from gevent import sleep, GreenletExit from gevent.greenlet import Greenlet from gevent.event import Event from teuthology import misc as teuthology @@ -18,14 +17,23 @@ from tasks.thrasher import Thrasher log = logging.getLogger(__name__) +class ForwardScrubber(Thrasher, Greenlet): + """ + ForwardScrubber:: + + The ForwardScrubber does forward scrubbing of file-systems during execution + of other tasks (workunits, etc). + """ + + def __init__(self, fs, scrub_timeout=300, sleep_between_iterations=1): + super(ForwardScrubber, self).__init__() -class MDSRankScrubber(Thrasher, Greenlet): - def __init__(self, fs, mds_rank, scrub_timeout=300): - super(MDSRankScrubber, self).__init__() self.logger = log.getChild('fs.[{f}]'.format(f=fs.name)) self.fs = fs - self.mds_rank = mds_rank + self.name = 'thrasher.fs.[{f}]'.format(f=fs.name) + self.stopping = Event() self.scrub_timeout = scrub_timeout + self.sleep_between_iterations = sleep_between_iterations def _run(self): try: @@ -35,10 +43,28 @@ class MDSRankScrubber(Thrasher, Greenlet): self.logger.exception("exception:") # allow successful completion so gevent doesn't see an exception... - def do_scrub(self, path="/", recursive=True): + def stop(self): + self.stopping.set() + + def do_scrub(self): + """ + Perform the file-system scrubbing + """ + self.logger.info(f'start scrubbing fs: {self.fs.name}') + + try: + while not self.stopping.is_set(): + self._scrub() + sleep(self.sleep_between_iterations) + except GreenletExit: + pass + + self.logger.info(f'end scrubbing fs: {self.fs.name}') + + def _scrub(self, path="/", recursive=True): + self.logger.info(f"scrubbing fs: {self.fs.name}") recopt = ["recursive", "force"] if recursive else ["force"] - out_json = self.fs.rank_tell(["scrub", "start", path] + recopt, - rank=self.mds_rank) + out_json = self.fs.rank_tell(["scrub", "start", path] + recopt) assert out_json is not None tag = out_json['scrub_tag'] @@ -47,15 +73,14 @@ class MDSRankScrubber(Thrasher, Greenlet): assert out_json['return_code'] == 0 assert out_json['mode'] == 'asynchronous' - self.wait_until_scrub_complete(tag) + return self._wait_until_scrub_complete(tag) - def wait_until_scrub_complete(self, tag): + def _wait_until_scrub_complete(self, tag): # time out after scrub_timeout seconds and assume as done with contextutil.safe_while(sleep=30, tries=self.scrub_timeout//30) as proceed: while proceed(): try: - out_json = self.fs.rank_tell(["scrub", "status"], - rank=self.mds_rank) + out_json = self.fs.rank_tell(["scrub", "status"]) assert out_json is not None if out_json['status'] == "no active scrubs running": self.logger.info("all active scrubs completed") @@ -72,96 +97,14 @@ class MDSRankScrubber(Thrasher, Greenlet): self.logger.info("retrying scrub status command in a while") pass - self.logger.info("timed out waiting for scrub to complete") - - -class ForwardScrubber(Thrasher, Greenlet): - """ - ForwardScrubber:: - - The ForwardScrubber does forward scrubbing of file-systems during execution - of other tasks (workunits, etc). - - """ - def __init__(self, fs, scrub_timeout=300, sleep_between_iterations=1): - super(ForwardScrubber, self).__init__() - - self.logger = log.getChild('fs.[{f}]'.format(f=fs.name)) - self.fs = fs - self.name = 'thrasher.fs.[{f}]'.format(f=fs.name) - self.stopping = Event() - self.lock = threading.Lock() - self.scrubbers = [] - self.scrub_timeout = scrub_timeout - self.sleep_between_iterations = sleep_between_iterations - - def _run(self): - try: - self.do_scrub() - except Exception as e: - self.set_thrasher_exception(e) - self.logger.exception("exception:") - # allow successful completion so gevent doesn't see an exception... - - def log(self, x): - """Write data to the logger assigned to ForwardScrubber""" - self.logger.info(x) - - def stop(self): - self.stopping.set() - self.lock.acquire() - try: - self.log("killing all scrubbers") - killall(self.scrubbers) - finally: - self.lock.release() - - def do_scrub(self): - """ - Perform the file-system scrubbing - """ - self.log(f'starting do_scrub for fs: {self.fs.name}') - - try: - while not self.stopping.is_set(): - ranks = self.fs.get_all_mds_rank() - - for r in ranks: - scrubber = MDSRankScrubber(self.fs, r, self.scrub_timeout) - self.lock.acquire() - try: - self.scrubbers.append(scrubber) - finally: - self.lock.release() - scrubber.start() - - # wait for all scrubbers to complete - self.log("joining all scrubbers") - joinall(self.scrubbers) - - for s in self.scrubbers: - if s.exception is not None: - raise RuntimeError('error during scrub thrashing') - - self.lock.acquire() - try: - self.scrubbers.clear() - finally: - self.lock.release() - - sleep(self.sleep_between_iterations) - except GreenletExit: - pass - - def stop_all_fwd_scrubbers(thrashers): for thrasher in thrashers: if not isinstance(thrasher, ForwardScrubber): continue thrasher.stop() + thrasher.join() if thrasher.exception is not None: raise RuntimeError(f"error during scrub thrashing: {thrasher.exception}") - thrasher.join() @contextlib.contextmanager diff --git a/src/client/Client.cc b/src/client/Client.cc index d6ed10ab425..9436862cff3 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -5794,14 +5794,14 @@ int Client::resolve_mds( ceph_assert(targets != nullptr); mds_role_t role; - std::stringstream ss; - int role_r = fsmap->parse_role(mds_spec, &role, ss); + CachedStackStringStream css; + int role_r = fsmap->parse_role(mds_spec, &role, *css); if (role_r == 0) { // We got a role, resolve it to a GID - ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '" - << role << "'" << dendl; - targets->push_back( - fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id); + auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank); + ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '" + << role << "' aka " << info.human_name() << dendl; + targets->push_back(info.global_id); return 0; } @@ -5811,42 +5811,48 @@ int Client::resolve_mds( // It is a possible GID const mds_gid_t mds_gid = mds_gid_t(rank_or_gid); if (fsmap->gid_exists(mds_gid)) { - ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl; + auto& info = fsmap->get_info_gid(mds_gid); + ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka " + << info.human_name() << dendl; targets->push_back(mds_gid); + return 0; } else { - lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map" + lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map" << dendl; + lderr(cct) << "FSMap: " << *fsmap << dendl; return -ENOENT; } } else if (mds_spec == "*") { // It is a wildcard: use all MDSs - const auto mds_info = fsmap->get_mds_info(); + const auto& mds_info = fsmap->get_mds_info(); + ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl; if (mds_info.empty()) { - lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl; + lderr(cct) << __func__ << ": no MDS daemons found" << dendl; + lderr(cct) << "FSMap: " << *fsmap << dendl; return -ENOENT; } - for (const auto& i : mds_info) { - targets->push_back(i.first); + for (const auto& [gid, info] : mds_info) { + ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl; + targets->push_back(gid); } + return 0; } else { // It did not parse as an integer, it is not a wildcard, it must be a name const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec); if (mds_gid == 0) { - lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl; - + lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl; lderr(cct) << "FSMap: " << *fsmap << dendl; - return -ENOENT; } else { - ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec - << "' to GID " << mds_gid << dendl; + auto& info = fsmap->get_info_gid(mds_gid); + ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec + << "' to " << info.human_name() << dendl; targets->push_back(mds_gid); } + return 0; } - - return 0; }