MDSMonitor: add fs fail command

This command sets the fs as not joinable and fails all ranks. This is a simpler
command than the typical sequence: (a) set fs not joinable; (b) iterate through
and fail ranks. It also does this in a single FSMap update.

Fixes: http://tracker.ceph.com/issues/37085

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
Patrick Donnelly 2019-01-15 13:39:09 -08:00
parent 38a99f04f4
commit 4c49f165ec
No known key found for this signature in database
GPG Key ID: 3A2A7E25BEA8AADB
4 changed files with 82 additions and 4 deletions

View File

@ -260,6 +260,10 @@
specified in the ``[global]`` section to allow daemons and clients
to discover the monitors.
* New command `fs fail` has been added to quickly bring down a file
system. This is a single command that unsets the joinable flag on the file
system and brings down all of its ranks.
Upgrading from Luminous
-----------------------

View File

@ -98,9 +98,17 @@ client I/O is stopped.
Taking the cluster down rapidly for deletion or disaster recovery
-----------------------------------------------------------------
To allow rapidly deleting a file system (for testing) or to quickly bring MDS
daemons down, the operator may also set a flag to prevent standbys from
activating on the file system. This is done using the ``joinable`` flag:
To allow rapidly deleting a file system (for testing) or to quickly bring the
file system and MDS daemons down, use the ``fs fail`` command:
::
fs fail <fs_name>
This command fails all ranks of the file system and marks it as not joinable
(clearing the ``joinable`` flag), which prevents standbys from activating on
the file system.
This process can also be done manually by doing the following:
::
@ -117,6 +125,12 @@ respawn as standbys. The file system will be left in a degraded state.
Once all ranks are inactive, the file system may also be deleted or left in
this state for other purposes (perhaps disaster recovery).
To bring the cluster back up, simply set the joinable flag:
::
fs set <fs_name> joinable true
Daemons
-------

View File

@ -76,6 +76,61 @@ class FlagSetHandler : public FileSystemCommandHandler
}
};
class FailHandler : public FileSystemCommandHandler
{
public:
FailHandler()
: FileSystemCommandHandler("fs fail")
{
}
int handle(
Monitor* mon,
FSMap& fsmap,
MonOpRequestRef op,
const cmdmap_t& cmdmap,
std::stringstream& ss) override
{
if (!mon->osdmon()->is_writeable()) {
// not allowed to write yet, so retry when we can
mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
return -EAGAIN;
}
std::string fs_name;
if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name) || fs_name.empty()) {
ss << "Missing filesystem name";
return -EINVAL;
}
auto fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
ss << "Not found: '" << fs_name << "'";
return -ENOENT;
}
auto f = [](auto fs) {
fs->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE);
};
fsmap.modify_filesystem(fs->fscid, std::move(f));
std::vector<mds_gid_t> to_fail;
for (const auto& p : fs->mds_map.get_mds_info()) {
to_fail.push_back(p.first);
}
for (const auto& gid : to_fail) {
mon->mdsmon()->fail_mds_gid(fsmap, gid);
}
mon->osdmon()->propose_pending();
ss << fs_name;
ss << " marked not joinable; MDS cannot join the cluster. All MDS ranks marked failed.";
return 0;
}
};
class FsNewHandler : public FileSystemCommandHandler
{
public:
@ -691,7 +746,7 @@ class RemoveFilesystemHandler : public FileSystemCommandHandler
// Check that no MDS daemons are active
if (fs->mds_map.get_num_up_mds() > 0) {
ss << "all MDS daemons must be inactive before removing filesystem";
ss << "all MDS daemons must be inactive/failed before removing filesystem. See `ceph fs fail`.";
return -EINVAL;
}
@ -878,6 +933,7 @@ FileSystemCommandHandler::load(Paxos *paxos)
std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
handlers.push_back(std::make_shared<SetHandler>());
handlers.push_back(std::make_shared<FailHandler>());
handlers.push_back(std::make_shared<FlagSetHandler>());
handlers.push_back(std::make_shared<AddDataPoolHandler>(paxos));
handlers.push_back(std::make_shared<RemoveDataPoolHandler>());

View File

@ -371,6 +371,10 @@ COMMAND("fs new " \
"name=allow_dangerous_metadata_overlay,type=CephBool,req=false", \
"make new filesystem using named pools <metadata> and <data>", \
"fs", "rw")
/* "fs fail <fs_name>": bring the named file system and all of its ranks down
 * with a single command; registered under the "fs" module with "rw" perms. */
COMMAND("fs fail " \
"name=fs_name,type=CephString ", \
"bring the file system down and all of its ranks", \
"fs", "rw")
COMMAND("fs rm " \
"name=fs_name,type=CephString " \
"name=yes_i_really_mean_it,type=CephBool,req=false", \