From 4c49f165ec1a5fd6a6da42606bd3a53b800c0a5f Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Tue, 15 Jan 2019 13:39:09 -0800
Subject: [PATCH] MDSMonitor: add fs fail command

This command sets the fs as not joinable and fails all ranks. This is a
simpler command than the typical sequence: (a) set fs not joinable; (b) iterate
through and fail ranks. It also does this in a single FSMap update.

Fixes: http://tracker.ceph.com/issues/37085
Signed-off-by: Patrick Donnelly
---
 PendingReleaseNotes           |  4 +++
 doc/cephfs/administration.rst | 20 ++++++++++--
 src/mon/FSCommands.cc         | 71 ++++++++++++++++++++++++++++++++++-
 src/mon/MonCommands.h         |  4 +++
 4 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 7800b9d219f..39f27354441 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -260,6 +260,10 @@
   specified in the ``[global]`` section to allow daemons and clients to discover
   the monitors.
 
+* New command `fs fail` has been added to quickly bring down a file
+  system. This is a single command that unsets the joinable flag on the file
+  system and brings down all of its ranks.
+
 Upgrading from Luminous
 -----------------------
 
diff --git a/doc/cephfs/administration.rst b/doc/cephfs/administration.rst
index 289e668bccf..3f70c8c75a2 100644
--- a/doc/cephfs/administration.rst
+++ b/doc/cephfs/administration.rst
@@ -98,9 +98,17 @@ client I/O is stopped.
 Taking the cluster down rapidly for deletion or disaster recovery
 -----------------------------------------------------------------
 
-To allow rapidly deleting a file system (for testing) or to quickly bring MDS
-daemons down, the operator may also set a flag to prevent standbys from
-activating on the file system. This is done using the ``joinable`` flag:
+To allow rapidly deleting a file system (for testing) or to quickly bring the
+file system and MDS daemons down, use the ``fs fail`` command:
+
+::
+
+    fs fail <fs_name>
+
+This command sets a file system flag to prevent standbys from
+activating on the file system (the ``joinable`` flag).
+
+This process can also be done manually by doing the following:
 
 ::
 
@@ -117,6 +125,12 @@ respawn as standbys. The file system will be left in a degraded state.
 Once all ranks are inactive, the file system may also be deleted or left in this
 state for other purposes (perhaps disaster recovery).
 
+To bring the cluster back up, simply set the joinable flag:
+
+::
+
+    fs set <fs_name> joinable true
+
 Daemons
 -------
 
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc
index f1fe9eab8ae..a840e815894 100644
--- a/src/mon/FSCommands.cc
+++ b/src/mon/FSCommands.cc
@@ -76,6 +76,74 @@ class FlagSetHandler : public FileSystemCommandHandler
   }
 };
 
+/**
+ * Handler for the "fs fail" command.
+ *
+ * Sets CEPH_MDSMAP_NOT_JOINABLE on the named file system and then fails every
+ * MDS GID listed in that file system's MDS map.
+ */
+class FailHandler : public FileSystemCommandHandler
+{
+  public:
+  FailHandler()
+    : FileSystemCommandHandler("fs fail")
+  {
+  }
+
+  /**
+   * @return 0 on success; -EAGAIN if the OSD monitor is not yet writeable (the
+   *         op is queued for retry); -EINVAL if fs_name is missing or empty;
+   *         -ENOENT if no file system has that name.
+   */
+  int handle(
+      Monitor* mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::stringstream& ss) override
+  {
+    if (!mon->osdmon()->is_writeable()) {
+      // not allowed to write yet, so retry when we can
+      mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+
+    std::string fs_name;
+    if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+      ss << "Not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+
+    // Flag the file system first, then fail its daemons.
+    fsmap.modify_filesystem(fs->fscid, [](auto&& target) {
+      target->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE);
+    });
+
+    // Collect the GIDs up front instead of calling fail_mds_gid() while walking
+    // get_mds_info(); presumably fail_mds_gid() mutates that map -- confirm.
+    std::vector<mds_gid_t> to_fail;
+    for (const auto& p : fs->mds_map.get_mds_info()) {
+      to_fail.push_back(p.first);
+    }
+
+    for (const auto& gid : to_fail) {
+      mon->mdsmon()->fail_mds_gid(fsmap, gid);
+    }
+    mon->osdmon()->propose_pending();
+
+    ss << fs_name;
+    ss << " marked not joinable; MDS cannot join the cluster. All MDS ranks marked failed.";
+
+    return 0;
+  }
+};
+
 class FsNewHandler : public FileSystemCommandHandler
 {
   public:
@@ -691,7 +759,7 @@ class RemoveFilesystemHandler : public FileSystemCommandHandler
 
     // Check that no MDS daemons are active
     if (fs->mds_map.get_num_up_mds() > 0) {
-      ss << "all MDS daemons must be inactive before removing filesystem";
+      ss << "all MDS daemons must be inactive/failed before removing filesystem. See `ceph fs fail`.";
       return -EINVAL;
     }
 
@@ -878,6 +946,7 @@ FileSystemCommandHandler::load(Paxos *paxos)
   std::list > handlers;
 
   handlers.push_back(std::make_shared());
+  handlers.push_back(std::make_shared<FailHandler>());
   handlers.push_back(std::make_shared());
   handlers.push_back(std::make_shared(paxos));
   handlers.push_back(std::make_shared());
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index f5ce691fe2c..741a2b28f4e 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -371,6 +371,10 @@ COMMAND("fs new " \
 	"name=allow_dangerous_metadata_overlay,type=CephBool,req=false", \
 	"make new filesystem using named pools and ", \
 	"fs", "rw")
+COMMAND("fs fail " \
+	"name=fs_name,type=CephString ", \
+	"bring the file system down and all of its ranks", \
+	"fs", "rw")
 COMMAND("fs rm " \
 	"name=fs_name,type=CephString " \
 	"name=yes_i_really_mean_it,type=CephBool,req=false", \