mirror of
https://github.com/ceph/ceph
synced 2025-02-20 09:27:35 +00:00
osd: add osd_fast_shutdown option (default true)
If we get a SIGINT or SIGTERM or are deleted from the OSDMap, do a fast shutdown by exiting immediately. This has a few important benefits: - We immediately stop responding (binding) to any sockets, which means other OSDs will immediately decide we are down (and dead!). This minimizes IO interruption. - We avoid the complex "clean" shutdown process, which is historically a source of bugs. In reality, the only purpose of the "clean" shutdown is to try to tear down everything in memory so we can do memory leak checking with valgrind. Set this option to false for valgrind QA runs so we can still do that. Note that with the new read leases in octopus, we rely on the default behavior that an ECONNREFUSED is taken to mean that the OSD is fully dead, so that we don't have to wait for any leases to time out. This works in sane environments with normal IP networks, but that behavior could conceivably be a bad idea if there are some weird network shenanigans going on. If osd_fast_fail_on_connection_refused were disabled, then this fast shutdown procedure might be *worse* than the clean shutdown because we would have to wait for the heartbeat timeout. Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
parent
fc2bb6ed47
commit
cf352c3ac0
@ -17,6 +17,8 @@ overrides:
|
||||
mds heartbeat grace: 60
|
||||
mon:
|
||||
mon osd crush smoke test: false
|
||||
osd:
|
||||
osd fast shutdown: false
|
||||
valgrind:
|
||||
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
|
||||
osd: [--tool=memcheck]
|
||||
|
@ -23,6 +23,8 @@ overrides:
|
||||
osd max object namespace len: 64
|
||||
mon:
|
||||
mon osd crush smoke test: false
|
||||
osd:
|
||||
osd fast shutdown: false
|
||||
valgrind:
|
||||
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
|
||||
osd: [--tool=memcheck]
|
||||
|
@ -13,6 +13,8 @@ overrides:
|
||||
debug refs: 5
|
||||
mon:
|
||||
mon osd crush smoke test: false
|
||||
osd:
|
||||
osd fast shutdown: false
|
||||
log-whitelist:
|
||||
- overall HEALTH_
|
||||
# valgrind is slow.. we might get PGs stuck peering etc
|
||||
|
@ -11,6 +11,8 @@ overrides:
|
||||
osd heartbeat grace: 40
|
||||
mon:
|
||||
mon osd crush smoke test: false
|
||||
osd:
|
||||
osd fast shutdown: false
|
||||
valgrind:
|
||||
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
|
||||
osd: [--tool=memcheck]
|
||||
|
@ -12,6 +12,8 @@ overrides:
|
||||
osd heartbeat grace: 40
|
||||
mon:
|
||||
mon osd crush smoke test: false
|
||||
osd:
|
||||
osd fast shutdown: false
|
||||
valgrind:
|
||||
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
|
||||
osd: [--tool=memcheck]
|
||||
|
@ -764,6 +764,7 @@ OPTION(osd_op_history_slow_op_size, OPT_U32) // Max number of slow ops
|
||||
OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold
|
||||
OPTION(osd_target_transaction_size, OPT_INT) // to adjust various transactions that batch smaller items
|
||||
OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe)
|
||||
OPTION(osd_fast_shutdown, OPT_BOOL)
|
||||
OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections
|
||||
|
||||
OPTION(osd_pg_object_context_cache_count, OPT_INT)
|
||||
|
@ -3498,6 +3498,11 @@ std::vector<Option> get_global_options() {
|
||||
.set_default(.97)
|
||||
.set_description(""),
|
||||
|
||||
Option("osd_fast_shutdown", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(true)
|
||||
.set_description("Fast, immediate shutdown")
|
||||
.set_long_description("Setting this to false makes the OSD do a slower teardown of all state when it receives a SIGINT or SIGTERM or when shutting down for any other reason. That slow shutdown is primarilyy useful for doing memory leak checking with valgrind."),
|
||||
|
||||
Option("osd_fast_fail_on_connection_refused", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(true)
|
||||
.set_description(""),
|
||||
|
@ -4016,6 +4016,12 @@ void OSD::create_recoverystate_perf()
|
||||
|
||||
int OSD::shutdown()
|
||||
{
|
||||
if (cct->_conf->osd_fast_shutdown) {
|
||||
derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
|
||||
cct->_log->flush();
|
||||
_exit(0);
|
||||
}
|
||||
|
||||
if (!service.prepare_to_stop())
|
||||
return 0; // already shutting down
|
||||
osd_lock.lock();
|
||||
|
@ -710,6 +710,7 @@ $DAEMONOPTS
|
||||
osd class dir = $OBJCLASS_PATH
|
||||
osd class load list = *
|
||||
osd class default list = *
|
||||
osd fast shutdown = false
|
||||
|
||||
filestore wbthrottle xfs ios start flusher = 10
|
||||
filestore wbthrottle xfs ios hard limit = 20
|
||||
|
Loading…
Reference in New Issue
Block a user