mirror of
https://github.com/ceph/ceph
synced 2024-12-17 17:05:42 +00:00
Merge pull request #15308 from jcsp/wip-19706
mon: don't kill MDSs unless some beacons are getting through. Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
commit
18fbf24c7a
@ -1,4 +1,7 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-whitelist:
|
||||
- not responding, replacing
|
||||
ceph-fuse:
|
||||
disabled: true
|
||||
tasks:
|
||||
|
@ -1,2 +1,7 @@
|
||||
tasks:
|
||||
- mds_thrash:
|
||||
|
||||
overrides:
|
||||
ceph:
|
||||
log-whitelist:
|
||||
- not responding, replacing
|
||||
|
@ -2,3 +2,8 @@ tasks:
|
||||
- install:
|
||||
- ceph:
|
||||
- mds_thrash:
|
||||
|
||||
overrides:
|
||||
ceph:
|
||||
log-whitelist:
|
||||
- not responding, replacing
|
||||
|
@ -102,6 +102,7 @@ class CephFSTestCase(CephTestCase):
|
||||
# To avoid any issues with e.g. unlink bugs, we destroy and recreate
|
||||
# the filesystem rather than just doing a rm -rf of files
|
||||
self.mds_cluster.mds_stop()
|
||||
self.mds_cluster.mds_fail()
|
||||
self.mds_cluster.delete_all_filesystems()
|
||||
self.fs = None # is now invalid!
|
||||
|
||||
|
@ -503,6 +503,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
|
||||
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
|
||||
return false;
|
||||
}
|
||||
mon->clog->info() << "MDS daemon '" << m->get_name() << "' restarted";
|
||||
fail_mds_gid(existing);
|
||||
failed_mds = true;
|
||||
}
|
||||
@ -1920,12 +1921,24 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
|
||||
<< " " << ceph_mds_state_name(info.state)
|
||||
<< " since " << beacon.stamp << dendl;
|
||||
|
||||
// We will only take decisive action (replacing/removing a daemon)
|
||||
// if we have some indication that other daemon(s) are successfully
|
||||
// getting beacons through recently.
|
||||
utime_t latest_beacon;
|
||||
for (const auto & i : last_beacon) {
|
||||
latest_beacon = MAX(i.second.stamp, latest_beacon);
|
||||
}
|
||||
const bool may_replace = latest_beacon >
|
||||
(ceph_clock_now() -
|
||||
MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
|
||||
|
||||
// are we in?
|
||||
// and is there a non-laggy standby that can take over for us?
|
||||
mds_gid_t sgid;
|
||||
if (info.rank >= 0 &&
|
||||
info.state != MDSMap::STATE_STANDBY &&
|
||||
info.state != MDSMap::STATE_STANDBY_REPLAY &&
|
||||
may_replace &&
|
||||
!pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
|
||||
(sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
|
||||
g_conf->mon_force_standby_active)) != MDS_GID_NONE)
|
||||
@ -1937,6 +1950,11 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
|
||||
<< " " << ceph_mds_state_name(info.state)
|
||||
<< " with " << sgid << "/" << si.name << " " << si.addr << dendl;
|
||||
|
||||
mon->clog->warn() << "MDS daemon '" << info.name << "'"
|
||||
<< " is not responding, replacing it "
|
||||
<< "as rank " << info.rank
|
||||
<< " with standby '" << si.name << "'";
|
||||
|
||||
// Remember what NS the old one was in
|
||||
const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
|
||||
|
||||
@ -1948,11 +1966,14 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
|
||||
pending_fsmap.promote(sgid, fs, info.rank);
|
||||
|
||||
*mds_propose = true;
|
||||
} else if (info.state == MDSMap::STATE_STANDBY_REPLAY ||
|
||||
info.state == MDSMap::STATE_STANDBY) {
|
||||
} else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
|
||||
info.state == MDSMap::STATE_STANDBY) && may_replace) {
|
||||
dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
|
||||
<< "." << info.inc << " " << ceph_mds_state_name(info.state)
|
||||
<< dendl;
|
||||
mon->clog->info() << "MDS standby '" << info.name
|
||||
<< "' is not responding, removing it from the set of "
|
||||
<< "standbys";
|
||||
fail_mds_gid(gid);
|
||||
*mds_propose = true;
|
||||
} else if (!info.laggy()) {
|
||||
|
Loading…
Reference in New Issue
Block a user