From 1e7d227bfc511d53efbe96c52bcd5af9c7cc08b4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 28 Feb 2017 12:28:38 -0800 Subject: [PATCH 01/24] osd: Fix log message Signed-off-by: David Zafman --- src/os/filestore/FileStore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc index 475d7f59481..3b0eabd12d5 100644 --- a/src/os/filestore/FileStore.cc +++ b/src/os/filestore/FileStore.cc @@ -3952,7 +3952,7 @@ void FileStore::sync_entry() derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; assert(0 == "wait_sync got error"); } - dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl; + dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl; } } else { From 3a66f1fbf606157ea3f3ae96e2e9577a54a60886 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Apr 2017 11:28:42 -0700 Subject: [PATCH 02/24] ceph-objectstore-tool: cleanup comment Signed-off-by: David Zafman --- src/tools/ceph_objectstore_tool.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index 6b173f204d0..f9bf85a5d63 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -2906,7 +2906,7 @@ int main(int argc, char **argv) throw std::runtime_error(ss.str()); } vector::iterator i = array.begin(); - //if (i == array.end() || i->type() != json_spirit::str_type) { + assert(i != array.end()); if (i->type() != json_spirit::str_type) { ss << "Object '" << object << "' must be a JSON array with the first element a string"; From 6c930f7e661de92533fac9b9bdd197f07b67ff3f Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 10:40:08 -0700 Subject: [PATCH 03/24] osd: Increase osd_backfill_retry_interval to 30 seconds Signed-off-by: David Zafman --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ae6fdf2ccb3..974ee7efd33 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -630,7 +630,7 @@ OPTION(osd_min_recovery_priority, OPT_INT, 0) OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85) // Seconds to wait before retrying refused backfills -OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 10.0) +OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) // max agent flush ops OPTION(osd_agent_max_ops, OPT_INT, 4) From d024ab05049f39741d05f9d202370b3d3ebfc731 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 10:27:31 -0700 Subject: [PATCH 04/24] osd: Remove unused argument to clear_queued_recovery Signed-off-by: David Zafman --- src/osd/OSD.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 73998a100e5..fc60df7c65f 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1025,7 +1025,7 @@ public: Mutex::Locker l(recovery_lock); _maybe_queue_recovery(); } - void clear_queued_recovery(PG *pg, bool front = false) { + void clear_queued_recovery(PG *pg) { Mutex::Locker l(recovery_lock); for (list >::iterator i = awaiting_throttle.begin(); i != awaiting_throttle.end(); From 811f89a682e831f98d602c9887034cef517a009a Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 7 Apr 2017 09:36:26 -0700 Subject: [PATCH 05/24] test: Switch from pg to osd for set-*-ratio commands Testing of 6422e0a220fb3f32ccae50e0c7e52dc9984685c6 Signed-off-by: David Zafman --- src/test/pybind/test_ceph_argparse.py | 30 
+++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 0608655b587..e9694064bd2 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -183,21 +183,6 @@ class TestPG(TestArgparse): def test_force_create_pg(self): self.one_pgid('force_create_pg') - def set_ratio(self, command): - self.assert_valid_command(['pg', - command, - '0.0']) - assert_equal({}, validate_command(sigdict, ['pg', command])) - assert_equal({}, validate_command(sigdict, ['pg', - command, - '2.0'])) - - def test_set_full_ratio(self): - self.set_ratio('set_full_ratio') - - def test_set_nearfull_ratio(self): - self.set_ratio('set_nearfull_ratio') - class TestAuth(TestArgparse): @@ -1153,6 +1138,21 @@ class TestOSD(TestArgparse): 'poolname', 'toomany'])) + def set_ratio(self, command): + self.assert_valid_command(['osd', + command, + '0.0']) + assert_equal({}, validate_command(sigdict, ['osd', command])) + assert_equal({}, validate_command(sigdict, ['osd', + command, + '2.0'])) + + def test_set_full_ratio(self): + self.set_ratio('set-full-ratio') + + def test_set_nearfull_ratio(self): + self.set_ratio('set-nearfull-ratio') + class TestConfigKey(TestArgparse): From e927cd2cf23d97e0d49647f79acd2f3a95bd43c3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 7 Apr 2017 11:52:57 -0700 Subject: [PATCH 06/24] test: Fix intended test flow and restore nearfull-ratio This is inconsequential but seems to have always been wrong since original commit 6cafb0e3e0bc5f992c2483a46cb00e83dca035cc Signed-off-by: David Zafman --- qa/workunits/rest/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index 1208f85b907..1a7ab30b886 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -362,7 +362,7 @@ if __name__ == '__main__': expect('osd/set-nearfull-ratio?ratio=0.90', 'PUT', 200, '') r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) assert(float(r.myjson['output']['nearfull_ratio']) == 0.90) - expect('osd/set-full-ratio?ratio=0.85', 'PUT', 200, '') + expect('osd/set-nearfull-ratio?ratio=0.85', 'PUT', 200, '') r = expect('pg/stat', 'GET', 200, 'json', JSONHDR) assert('num_pgs' in r.myjson['output']) From 5baf7abfa36f7fb2a174c5255b0d4afc65df6388 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 10:30:29 -0700 Subject: [PATCH 07/24] osd: Fail-safe full is a hard stop even for mds We can't allow OSD to become non-startable even if mds could be writing as part of file removals. Signed-off-by: David Zafman --- src/osd/PrimaryLogPG.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 2f1599963c9..5fd0463401f 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1888,7 +1888,10 @@ void PrimaryLogPG::do_op(OpRequestRef& op) << *m << dendl; return; } - if (!(m->get_source().is_mds()) && osd->check_failsafe_full() && write_ordered) { + // mds should have stopped writing before this point. + // We can't allow OSD to become non-startable even if mds + // could be writing as part of file removals. 
+ if (write_ordered && osd->check_failsafe_full()) { dout(10) << __func__ << " fail-safe full check failed, dropping request" << dendl; return; From 79124330c7e8b377125dd52e12275cf81b3c38e6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 11:17:13 -0700 Subject: [PATCH 08/24] osd: too_full_for_backfill() returns ostream for reason Signed-off-by: David Zafman --- src/osd/OSD.cc | 7 ++----- src/osd/OSD.h | 2 +- src/osd/PG.cc | 18 ++++++++---------- src/osd/PrimaryLogPG.cc | 7 +++---- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7b1a021a228..167c0c1ff3a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -823,15 +823,12 @@ bool OSDService::is_full() return cur_state >= FULL; } -bool OSDService::too_full_for_backfill(double *_ratio, double *_max_ratio) +bool OSDService::too_full_for_backfill(ostream &ss) { Mutex::Locker l(full_status_lock); double max_ratio; max_ratio = cct->_conf->osd_backfill_full_ratio; - if (_ratio) - *_ratio = cur_ratio; - if (_max_ratio) - *_max_ratio = max_ratio; + ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio " << max_ratio; return cur_ratio >= max_ratio; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index fc60df7c65f..e593d552a6b 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1155,7 +1155,7 @@ public: bool check_failsafe_full(); bool is_nearfull(); bool is_full(); - bool too_full_for_backfill(double *ratio, double *max_ratio); + bool too_full_for_backfill(ostream &ss); bool need_fullness_update(); ///< osdmap state needs update diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 56b9c19bed8..7cee9141820 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6554,18 +6554,17 @@ boost::statechart::result PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt) { PG *pg = context< RecoveryMachine >().pg; - double ratio, max_ratio; + ostringstream ss; if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 && (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { ldout(pg->cct, 10) << "backfill reservation rejected: failure injection" << dendl; post_event(RemoteReservationRejected()); - } else if (pg->osd->too_full_for_backfill(&ratio, &max_ratio) && + } else if (pg->osd->too_full_for_backfill(ss) && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { - ldout(pg->cct, 10) << "backfill reservation rejected: full ratio is " - << ratio << ", which is greater than max allowed ratio " - << max_ratio << dendl; + ldout(pg->cct, 10) << "backfill reservation rejected: " + << ss.str() << dendl; post_event(RemoteReservationRejected()); } else { pg->osd->remote_reserver.request_reservation( @@ -6590,7 +6589,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & { PG *pg = context< RecoveryMachine >().pg; - double ratio, max_ratio; + ostringstream ss; if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 && (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { ldout(pg->cct, 10) << "backfill reservation rejected after reservation: " @@ -6598,11 +6597,10 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); post_event(RemoteReservationRejected()); return discard_event(); - } else if (pg->osd->too_full_for_backfill(&ratio, &max_ratio) && + } else if (pg->osd->too_full_for_backfill(ss) && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { 
- ldout(pg->cct, 10) << "backfill reservation rejected after reservation: full ratio is " - << ratio << ", which is greater than max allowed ratio " - << max_ratio << dendl; + ldout(pg->cct, 10) << "backfill reservation rejected after reservation: " + << ss.str() << dendl; pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); post_event(RemoteReservationRejected()); return discard_event(); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 5fd0463401f..553ffca191e 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -3331,10 +3331,9 @@ void PrimaryLogPG::do_scan( switch (m->op) { case MOSDPGScan::OP_SCAN_GET_DIGEST: { - double ratio, full_ratio; - if (osd->too_full_for_backfill(&ratio, &full_ratio)) { - dout(1) << __func__ << ": Canceling backfill, current usage is " - << ratio << ", which exceeds " << full_ratio << dendl; + ostringstream ss; + if (osd->too_full_for_backfill(ss)) { + dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl; queue_peering_event( CephPeeringEvtRef( std::make_shared( From 79a4ac41c5fd6613e5b9a51de595a7043425916d Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 29 Mar 2017 17:35:51 -0700 Subject: [PATCH 09/24] common: Remove unused config option osd_recovery_threads Signed-off-by: David Zafman --- doc/rados/configuration/osd-config-ref.rst | 7 ------- doc/rados/operations/monitoring-osd-pg.rst | 3 +-- src/common/config_opts.h | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst index 06e46a6eab9..d68316dff60 100644 --- a/doc/rados/configuration/osd-config-ref.rst +++ b/doc/rados/configuration/osd-config-ref.rst @@ -673,13 +673,6 @@ perform well in a degraded state. :Default: ``8 << 20`` -``osd recovery threads`` - -:Description: The number of threads for recovering data. -:Type: 32-bit Integer -:Default: ``1`` - - ``osd recovery thread timeout`` :Description: The maximum time in seconds before timing out a recovery thread. diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst index 866ae8313ba..37f1960cba6 100644 --- a/doc/rados/operations/monitoring-osd-pg.rst +++ b/doc/rados/operations/monitoring-osd-pg.rst @@ -468,8 +468,7 @@ Ceph provides a number of settings to balance the resource contention between new service requests and the need to recover data objects and restore the placement groups to the current state. The ``osd recovery delay start`` setting allows an OSD to restart, re-peer and even process some replay requests before -starting the recovery process. The ``osd recovery threads`` setting limits the -number of threads for the recovery process (1 thread by default). The ``osd +starting the recovery process. The ``osd recovery thread timeout`` sets a thread timeout, because multiple OSDs may fail, restart and re-peer at staggered rates. 
The ``osd recovery max active`` setting limits the number of recovery requests an OSD will entertain simultaneously to diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 974ee7efd33..3de8c42ee89 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -742,7 +742,6 @@ OPTION(osd_op_pq_min_cost, OPT_U64, 65536) OPTION(osd_disk_threads, OPT_INT, 1) OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be best effort idle OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7 -OPTION(osd_recovery_threads, OPT_INT, 1) OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration OPTION(osd_op_num_threads_per_shard, OPT_INT, 2) OPTION(osd_op_num_shards, OPT_INT, 5) From 9dd695299901174babaffc7043a69c35f5302c66 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Apr 2017 15:46:37 -0700 Subject: [PATCH 10/24] common: Bump ratio for backfillfull from 85% to 90% Signed-off-by: David Zafman --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 3de8c42ee89..a3e74b84860 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -627,7 +627,7 @@ OPTION(osd_max_backfills, OPT_U64, 1) OPTION(osd_min_recovery_priority, OPT_INT, 0) // Refuse backfills when OSD full ratio is above this value -OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85) +OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.90) // Seconds to wait before retrying refused backfills OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) From 0264bbddb7616e55e7ea90b6f5a834275ebe8985 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 11:18:38 -0700 Subject: [PATCH 11/24] osd: For testing full disks add injectfull socket command Signed-off-by: David Zafman --- src/osd/OSD.cc | 33 ++++++++++++++++++++++++++++++++- src/osd/OSD.h | 1 + 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 167c0c1ff3a..8799b7933a3 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -221,6 +221,7 @@ OSDService::OSDService(OSD *osd) : whoami(osd->whoami), store(osd->store), log_client(osd->log_client), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), + injectfull(0), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), @@ -753,9 +754,10 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) } enum s_names new_state; + // If testing with injectfull, let's keep determined state as FAILSAFE if (ratio > failsafe_ratio) { new_state = FAILSAFE; - } else if (ratio > full_ratio) { + } else if (ratio > full_ratio || injectfull) { new_state = FULL; } else if (ratio > nearfull_ratio) { new_state = NEARFULL; @@ -806,6 +808,16 @@ bool OSDService::need_fullness_update() bool OSDService::check_failsafe_full() { Mutex::Locker l(full_status_lock); + + if (injectfull) { + // injectfull is either a count of the number of times to return full + // or if -1 then always return full + if (injectfull > 0) + --injectfull; + dout(5) << __func__ << " Injected full OSD (" << (injectfull < 0 ? 
"set" : std::to_string(injectfull)) << ")" << dendl; + return true; + } + if (cur_state == FAILSAFE) return true; return false; @@ -826,6 +838,14 @@ bool OSDService::is_full() bool OSDService::too_full_for_backfill(ostream &ss) { Mutex::Locker l(full_status_lock); + if (injectfull) { + // injectfull is either a count of the number of times to return full + // or if -1 then always return full + if (injectfull > 0) + --injectfull; + ss << "Injected full OSD (" << (injectfull < 0 ? string("set") : std::to_string(injectfull)) << ")"; + return true; + } double max_ratio; max_ratio = cct->_conf->osd_backfill_full_ratio; ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio " << max_ratio; @@ -2629,6 +2649,13 @@ void OSD::final_init() test_ops_hook, "Trigger a scheduled scrub "); assert(r == 0); + r = admin_socket->register_command( + "injectfull", + "injectfull " \ + "name=count,type=CephInt,req=false ", + test_ops_hook, + "Inject a full disk (optional count times)"); + assert(r == 0); } void OSD::create_logger() @@ -4851,6 +4878,10 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, pg->unlock(); return; } + if (command == "injectfull") { + cmd_getval(service->cct, cmdmap, "count", service->injectfull, (int64_t)-1); + return; + } ss << "Internal error - command=" << command; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index e593d552a6b..29bacb60879 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -461,6 +461,7 @@ public: LogClient &log_client; LogChannelRef clog; PGRecoveryStats &pg_recovery_stats; + int64_t injectfull; private: Messenger *&cluster_messenger; Messenger *&client_messenger; From a5731076add0af10686da482ecc29a1fa2600a14 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 12:42:54 -0700 Subject: [PATCH 12/24] osd: Handle backfillfull_ratio just like nearfull and full Add BACKFILLFULL as a local OSD cur_state Notify monitor of this new fullness state Signed-off-by: David Zafman --- .../osd_internals/recovery_reservation.rst | 6 +-- doc/man/8/ceph.rst | 6 +++ doc/rados/configuration/mon-config-ref.rst | 10 +++++ doc/rados/configuration/osd-config-ref.rst | 9 ---- doc/rados/operations/monitoring-osd-pg.rst | 5 ++- .../troubleshooting/troubleshooting-osd.rst | 10 +++-- qa/tasks/ceph_manager.py | 14 ++++--- qa/workunits/ceph-helpers.sh | 2 +- qa/workunits/cephtool/test.sh | 2 + qa/workunits/rest/test.py | 4 ++ src/common/ceph_strings.cc | 2 + src/common/config_opts.h | 4 +- src/include/rados.h | 1 + src/mon/MonCommands.h | 4 ++ src/mon/OSDMonitor.cc | 30 +++++++++---- src/osd/OSD.cc | 42 ++++++++++++------- src/osd/OSD.h | 4 +- src/osd/OSDMap.cc | 31 +++++++++++--- src/osd/OSDMap.h | 8 +++- src/test/cli/osdmaptool/clobber.t | 2 + src/test/cli/osdmaptool/create-print.t | 1 + src/test/cli/osdmaptool/create-racks.t | 1 + src/test/pybind/test_ceph_argparse.py | 3 ++ src/tools/ceph_monstore_tool.cc | 8 ++++ 24 files changed, 151 insertions(+), 58 deletions(-) diff --git a/doc/dev/osd_internals/recovery_reservation.rst b/doc/dev/osd_internals/recovery_reservation.rst index 24db1387f50..cabea04cc73 100644 --- a/doc/dev/osd_internals/recovery_reservation.rst +++ b/doc/dev/osd_internals/recovery_reservation.rst @@ -34,8 +34,8 @@ the typical process. Once the primary has its local reservation, it requests a remote reservation from the backfill target. This reservation CAN be rejected, -for instance if the OSD is too full (osd_backfill_full_ratio config -option). 
If the reservation is rejected, the primary drops its local +for instance if the OSD is too full (backfillfull_ratio osd setting). +If the reservation is rejected, the primary drops its local reservation, waits (osd_backfill_retry_interval), and then retries. It will retry indefinitely. @@ -64,7 +64,7 @@ to the monitor. The state chart can set: - recovering: recovering - backfill_wait: waiting for remote backfill reservations - backfilling: backfilling - - backfill_toofull: backfill reservation rejected, OSD too full + - backfill_toofull: backfill stopped, OSD(s) above backfillfull ratio -------- diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst index f878f882525..b2489126848 100644 --- a/doc/man/8/ceph.rst +++ b/doc/man/8/ceph.rst @@ -1166,6 +1166,12 @@ Usage:: ceph pg set_full_ratio +Subcommand ``set_backfillfull_ratio`` sets ratio at which pgs are considered too full to backfill. + +Usage:: + + ceph pg set_backfillfull_ratio + Subcommand ``set_nearfull_ratio`` sets ratio at which pgs are considered nearly full. diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst index 8c05571c6ce..b19461f7a62 100644 --- a/doc/rados/configuration/mon-config-ref.rst +++ b/doc/rados/configuration/mon-config-ref.rst @@ -400,6 +400,7 @@ a reasonable number for a near full ratio. [global] mon osd full ratio = .80 + mon osd backfillfull ratio = .75 mon osd nearfull ratio = .70 @@ -412,6 +413,15 @@ a reasonable number for a near full ratio. :Default: ``.95`` +``mon osd backfillfull ratio`` + +:Description: The percentage of disk space used before an OSD is + considered too ``full`` to backfill. + +:Type: Float +:Default: ``.90`` + + ``mon osd nearfull ratio`` :Description: The percentage of disk space used before an OSD is diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst index d68316dff60..5679c0caeeb 100644 --- a/doc/rados/configuration/osd-config-ref.rst +++ b/doc/rados/configuration/osd-config-ref.rst @@ -560,15 +560,6 @@ priority than requests to read or write data. :Default: ``512`` -``osd backfill full ratio`` - -:Description: Refuse to accept backfill requests when the Ceph OSD Daemon's - full ratio is above this value. - -:Type: Float -:Default: ``0.85`` - - ``osd backfill retry interval`` :Description: The number of seconds to wait before retrying backfill requests. diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst index 37f1960cba6..b390b030b71 100644 --- a/doc/rados/operations/monitoring-osd-pg.rst +++ b/doc/rados/operations/monitoring-osd-pg.rst @@ -496,8 +496,9 @@ placement group can't be backfilled, it may be considered ``incomplete``. Ceph provides a number of settings to manage the load spike associated with reassigning placement groups to an OSD (especially a new OSD). By default, ``osd_max_backfills`` sets the maximum number of concurrent backfills to or from -an OSD to 10. The ``osd backfill full ratio`` enables an OSD to refuse a -backfill request if the OSD is approaching its full ratio (85%, by default). +an OSD to 10. The ``backfill full ratio`` enables an OSD to refuse a +backfill request if the OSD is approaching its full ratio (90%, by default) and +change with ``ceph osd set-backfillfull-ratio`` comand. If an OSD refuses a backfill request, the ``osd backfill retry interval`` enables an OSD to retry the request (after 10 seconds, by default). 
OSDs can also set ``osd backfill scan min`` and ``osd backfill scan max`` to manage scan diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst index 3661f8af45d..651907dfb05 100644 --- a/doc/rados/troubleshooting/troubleshooting-osd.rst +++ b/doc/rados/troubleshooting/troubleshooting-osd.rst @@ -206,7 +206,9 @@ Ceph prevents you from writing to a full OSD so that you don't lose data. In an operational cluster, you should receive a warning when your cluster is getting near its full ratio. The ``mon osd full ratio`` defaults to ``0.95``, or 95% of capacity before it stops clients from writing data. -The ``mon osd nearfull ratio`` defaults to ``0.85``, or 85% of capacity +The ``mon osd backfillfull ratio`` defaults to ``0.90``, or 90 % of +capacity when it blocks backfills from starting. The +``mon osd nearfull ratio`` defaults to ``0.85``, or 85% of capacity when it generates a health warning. Full cluster issues usually arise when testing how Ceph handles an OSD @@ -214,7 +216,8 @@ failure on a small cluster. When one node has a high percentage of the cluster's data, the cluster can easily eclipse its nearfull and full ratio immediately. If you are testing how Ceph reacts to OSD failures on a small cluster, you should leave ample free disk space and consider temporarily -lowering the ``mon osd full ratio`` and ``mon osd nearfull ratio``. +lowering the ``mon osd full ratio``, ``mon osd backfillfull ratio`` and +``mon osd nearfull ratio``. Full ``ceph-osds`` will be reported by ``ceph health``:: @@ -225,9 +228,10 @@ Full ``ceph-osds`` will be reported by ``ceph health``:: Or:: ceph health - HEALTH_ERR 1 nearfull osds, 1 full osds + HEALTH_ERR 1 nearfull osds, 1 backfillfull osds, 1 full osds osd.2 is near full at 85% osd.3 is full at 97% + osd.4 is backfill full at 91% The best way to deal with a full cluster is to add new ``ceph-osds``, allowing the cluster to redistribute data to the newly available storage. diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index 8ff2556a7a0..1a9aff93c3c 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -696,7 +696,7 @@ class Thrasher: """ Test backfills stopping when the replica fills up. - First, use osd_backfill_full_ratio to simulate a now full + First, use injectfull admin command to simulate a now full osd by setting it to 0 on all of the OSDs. Second, on a random subset, set @@ -705,13 +705,14 @@ class Thrasher: Then, verify that all backfills stop. 
""" - self.log("injecting osd_backfill_full_ratio = 0") + self.log("injecting backfill full") for i in self.live_osds: self.ceph_manager.set_config( i, osd_debug_skip_full_check_in_backfill_reservation= - random.choice(['false', 'true']), - osd_backfill_full_ratio=0) + random.choice(['false', 'true'])) + self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'backfillfull'], + check_status=True, timeout=30, stdout=DEVNULL) for i in range(30): status = self.ceph_manager.compile_pg_status() if 'backfill' not in status.keys(): @@ -724,8 +725,9 @@ class Thrasher: for i in self.live_osds: self.ceph_manager.set_config( i, - osd_debug_skip_full_check_in_backfill_reservation='false', - osd_backfill_full_ratio=0.85) + osd_debug_skip_full_check_in_backfill_reservation='false') + self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'none'], + check_status=True, timeout=30, stdout=DEVNULL) def test_map_discontinuity(self): """ diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh index 9863668de75..8642d376a73 100755 --- a/qa/workunits/ceph-helpers.sh +++ b/qa/workunits/ceph-helpers.sh @@ -400,6 +400,7 @@ EOF if test -z "$(get_config mon $id mon_initial_members)" ; then ceph osd pool delete rbd rbd --yes-i-really-really-mean-it || return 1 ceph osd pool create rbd $PG_NUM || return 1 + ceph osd set-backfillfull-ratio .99 fi } @@ -634,7 +635,6 @@ function activate_osd() { ceph_disk_args+=" --prepend-to-path=" local ceph_args="$CEPH_ARGS" - ceph_args+=" --osd-backfill-full-ratio=.99" ceph_args+=" --osd-failsafe-full-ratio=.99" ceph_args+=" --osd-journal-size=100" ceph_args+=" --osd-scrub-load-threshold=2000" diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 8d0ce1baeff..a6c7b617cc6 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1419,6 +1419,8 @@ function test_mon_pg() ceph osd set-full-ratio .962 ceph osd dump | grep '^full_ratio 0.962' + ceph osd set-backfillfull-ratio .912 + ceph osd dump | grep '^backfillfull_ratio 0.912' ceph osd set-nearfull-ratio .892 ceph osd dump | grep '^nearfull_ratio 0.892' diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index 1a7ab30b886..7bbb6f3ccae 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -359,6 +359,10 @@ if __name__ == '__main__': r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) assert(float(r.myjson['output']['full_ratio']) == 0.90) expect('osd/set-full-ratio?ratio=0.95', 'PUT', 200, '') + expect('osd/set-backfillfull-ratio?ratio=0.88', 'PUT', 200, '') + r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) + assert(float(r.myjson['output']['backfillfull_ratio']) == 0.88) + expect('osd/set-backfillfull-ratio?ratio=0.90', 'PUT', 200, '') expect('osd/set-nearfull-ratio?ratio=0.90', 'PUT', 200, '') r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) assert(float(r.myjson['output']['nearfull_ratio']) == 0.90) diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc index 462dd6db249..1fec2f7b0a1 100644 --- a/src/common/ceph_strings.cc +++ b/src/common/ceph_strings.cc @@ -42,6 +42,8 @@ const char *ceph_osd_state_name(int s) return "full"; case CEPH_OSD_NEARFULL: return "nearfull"; + case CEPH_OSD_BACKFILLFULL: + return "backfillfull"; default: return "???"; } diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a3e74b84860..9eed4d485e2 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -308,6 +308,7 @@ OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on 
pools bel OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT, .5) // threshold of down osds after which we check all pgs OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full" +OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD backfill full (backfill halted) OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full OPTION(mon_allow_pool_delete, OPT_BOOL, false) // allow pool deletion OPTION(mon_globalid_prealloc, OPT_U32, 10000) // how many globalids to prealloc @@ -626,9 +627,6 @@ OPTION(osd_max_backfills, OPT_U64, 1) // Minimum recovery priority (255 = max, smaller = lower) OPTION(osd_min_recovery_priority, OPT_INT, 0) -// Refuse backfills when OSD full ratio is above this value -OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.90) - // Seconds to wait before retrying refused backfills OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) diff --git a/src/include/rados.h b/src/include/rados.h index c8bc8ac4c0c..986b42969c4 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -116,6 +116,7 @@ struct ceph_eversion { #define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ #define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */ #define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */ +#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */ extern const char *ceph_osd_state_name(int s); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 4e816890820..d1b09e66024 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -592,6 +592,10 @@ COMMAND("osd set-full-ratio " \ "name=ratio,type=CephFloat,range=0.0|1.0", \ "set usage ratio at which OSDs are marked full", "osd", "rw", "cli,rest") +COMMAND("osd set-backfillfull-ratio " \ + "name=ratio,type=CephFloat,range=0.0|1.0", \ + "set usage ratio at which OSDs are marked too full to backfill", + "osd", "rw", "cli,rest") COMMAND("osd set-nearfull-ratio " \ "name=ratio,type=CephFloat,range=0.0|1.0", \ "set usage ratio at which OSDs are marked near-full", diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7daca9c887d..3566c3f7330 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -164,6 +164,7 @@ void OSDMonitor::create_initial() if (!g_conf->mon_debug_no_require_luminous) { newmap.set_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); newmap.full_ratio = g_conf->mon_osd_full_ratio; + newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio; } @@ -784,8 +785,15 @@ void OSDMonitor::create_pending() OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc); dout(10) << "create_pending did clean_temps" << dendl; + // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config + // instead of osd_backfill_full_ratio config + if (osdmap.backfillfull_ratio <= 0) { + dout(1) << __func__ << " setting backfillfull_ratio = " + << g_conf->mon_osd_backfillfull_ratio << dendl; + pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; + } if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - // transition nearfull ratios from PGMap to OSDMap (on upgrade) + // transition full ratios from PGMap to OSDMap (on upgrade) PGMap *pg_map = &mon->pgmon()->pg_map; if (osdmap.full_ratio != pg_map->full_ratio) { dout(10) << __func__ << " full_ratio " << osdmap.full_ratio @@ 
-1048,8 +1056,8 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) tmp.apply_incremental(pending_inc); if (tmp.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - int full, nearfull; - tmp.count_full_nearfull_osds(&full, &nearfull); + int full, backfill, nearfull; + tmp.count_full_nearfull_osds(&full, &backfill, &nearfull); if (full > 0) { if (!tmp.test_flag(CEPH_OSDMAP_FULL)) { dout(10) << __func__ << " setting full flag" << dendl; @@ -2287,7 +2295,7 @@ bool OSDMonitor::preprocess_full(MonOpRequestRef op) MOSDFull *m = static_cast(op->get_req()); int from = m->get_orig_source().num(); set state; - unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_FULL; + unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL; // check permissions, ignore if failed MonSession *session = m->get_session(); @@ -2337,7 +2345,7 @@ bool OSDMonitor::prepare_full(MonOpRequestRef op) const MOSDFull *m = static_cast(op->get_req()); const int from = m->get_orig_source().num(); - const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_FULL; + const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL; const unsigned want_state = m->state & mask; // safety first unsigned cur_state = osdmap.get_state(from); @@ -3342,13 +3350,18 @@ void OSDMonitor::get_health(list >& summary, } if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - int full, nearfull; - osdmap.count_full_nearfull_osds(&full, &nearfull); + int full, backfill, nearfull; + osdmap.count_full_nearfull_osds(&full, &backfill, &nearfull); if (full > 0) { ostringstream ss; ss << full << " full osd(s)"; summary.push_back(make_pair(HEALTH_ERR, ss.str())); } + if (backfill > 0) { + ostringstream ss; + ss << backfill << " backfillfull osd(s)"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + } if (nearfull > 0) { ostringstream ss; ss << nearfull << " nearfull osd(s)"; @@ -6929,6 +6942,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, return true; } else if (prefix == "osd set-full-ratio" || + prefix == "osd set-backfillfull-ratio" || prefix == "osd set-nearfull-ratio") { if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { ss << "you must complete the upgrade and set require_luminous_osds before" @@ -6945,6 +6959,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, } if (prefix == "osd set-full-ratio") pending_inc.new_full_ratio = n; + else if (prefix == "osd set-backfillfull-ratio") + pending_inc.new_backfillfull_ratio = n; else if (prefix == "osd set-nearfull-ratio") pending_inc.new_nearfull_ratio = n; ss << prefix << " " << n; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 8799b7933a3..1368c9d654e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -736,20 +736,24 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) return; } float nearfull_ratio = osdmap->get_nearfull_ratio(); - float full_ratio = std::max(osdmap->get_full_ratio(), nearfull_ratio); + float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio); + float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio); float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio); if (!osdmap->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { // use the failsafe for nearfull and full; the mon isn't using the // flags anyway because we're mid-upgrade. 
full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; nearfull_ratio = failsafe_ratio; } else if (full_ratio <= 0 || + backfillfull_ratio <= 0 || nearfull_ratio <= 0) { - derr << __func__ << " full_ratio or nearfull_ratio is <= 0" << dendl; + derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl; // use failsafe flag. ick. the monitor did something wrong or the user // did something stupid. full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; nearfull_ratio = failsafe_ratio; } @@ -759,6 +763,8 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) new_state = FAILSAFE; } else if (ratio > full_ratio || injectfull) { new_state = FULL; + } else if (ratio > backfillfull_ratio) { + new_state = BACKFILLFULL; } else if (ratio > nearfull_ratio) { new_state = NEARFULL; } else { @@ -766,6 +772,7 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) } dout(20) << __func__ << " cur ratio " << ratio << ". nearfull_ratio " << nearfull_ratio + << ". backfillfull_ratio " << backfillfull_ratio << ", full_ratio " << full_ratio << ", failsafe_ratio " << failsafe_ratio << ", new state " << get_full_state_name(new_state) @@ -793,6 +800,8 @@ bool OSDService::need_fullness_update() if (osdmap->exists(whoami)) { if (osdmap->get_state(whoami) & CEPH_OSD_FULL) { cur = FULL; + } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) { + cur = BACKFILLFULL; } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) { cur = NEARFULL; } @@ -800,6 +809,8 @@ bool OSDService::need_fullness_update() s_names want = NONE; if (is_full()) want = FULL; + else if (is_backfillfull()) + want = BACKFILLFULL; else if (is_nearfull()) want = NEARFULL; return want != cur; @@ -818,15 +829,19 @@ bool OSDService::check_failsafe_full() return true; } - if (cur_state == FAILSAFE) - return true; - return false; + return cur_state == FAILSAFE; } bool OSDService::is_nearfull() { Mutex::Locker l(full_status_lock); - return cur_state == NEARFULL; + return cur_state >= NEARFULL; +} + +bool OSDService::is_backfillfull() +{ + Mutex::Locker l(full_status_lock); + return cur_state >= BACKFILLFULL; } bool OSDService::is_full() @@ -838,18 +853,11 @@ bool OSDService::is_full() bool OSDService::too_full_for_backfill(ostream &ss) { Mutex::Locker l(full_status_lock); - if (injectfull) { - // injectfull is either a count of the number of times to return full - // or if -1 then always return full - if (injectfull > 0) - --injectfull; - ss << "Injected full OSD (" << (injectfull < 0 ? 
string("set") : std::to_string(injectfull)) << ")"; + if (cur_state >= BACKFILLFULL) { + ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio"; return true; } - double max_ratio; - max_ratio = cct->_conf->osd_backfill_full_ratio; - ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio " << max_ratio; - return cur_ratio >= max_ratio; + return false; } void OSDService::update_osd_stat(vector& hb_peers) @@ -5213,6 +5221,8 @@ void OSD::send_full_update() unsigned state = 0; if (service.is_full()) { state = CEPH_OSD_FULL; + } else if (service.is_backfillfull()) { + state = CEPH_OSD_BACKFILLFULL; } else if (service.is_nearfull()) { state = CEPH_OSD_NEARFULL; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 29bacb60879..19a1ac7a2df 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1139,11 +1139,12 @@ public: // -- OSD Full Status -- private: Mutex full_status_lock; - enum s_names { NONE, NEARFULL, FULL, FAILSAFE } cur_state; // ascending + enum s_names { NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending const char *get_full_state_name(s_names s) { switch (s) { case NONE: return "none"; case NEARFULL: return "nearfull"; + case BACKFILLFULL: return "backfillfull"; case FULL: return "full"; case FAILSAFE: return "failsafe"; default: return "???"; @@ -1155,6 +1156,7 @@ private: public: bool check_failsafe_full(); bool is_nearfull(); + bool is_backfillfull(); bool is_full(); bool too_full_for_backfill(ostream &ss); bool need_fullness_update(); ///< osdmap state needs update diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index c4e388f86cf..6d0cbfe0a28 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -450,7 +450,7 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const } { - uint8_t target_v = 3; + uint8_t target_v = 4; if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { target_v = 2; } @@ -470,6 +470,7 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const if (target_v >= 3) { ::encode(new_nearfull_ratio, bl); ::encode(new_full_ratio, bl); + ::encode(new_backfillfull_ratio, bl); } ENCODE_FINISH(bl); // osd-only data } @@ -654,7 +655,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) } { - DECODE_START(3, bl); // extended, osd-only data + DECODE_START(4, bl); // extended, osd-only data ::decode(new_hb_back_up, bl); ::decode(new_up_thru, bl); ::decode(new_last_clean_interval, bl); @@ -677,6 +678,11 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) new_nearfull_ratio = -1; new_full_ratio = -1; } + if (struct_v >= 4) { + ::decode(new_backfillfull_ratio, bl); + } else { + new_backfillfull_ratio = -1; + } DECODE_FINISH(bl); // osd-only data } @@ -720,6 +726,7 @@ void OSDMap::Incremental::dump(Formatter *f) const f->dump_int("new_flags", new_flags); f->dump_float("new_full_ratio", new_full_ratio); f->dump_float("new_nearfull_ratio", new_nearfull_ratio); + f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio); if (fullmap.length()) { f->open_object_section("full_map"); @@ -1022,14 +1029,17 @@ int OSDMap::calc_num_osds() return num_osd; } -void OSDMap::count_full_nearfull_osds(int *full, int *nearfull) const +void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const { *full = 0; + *backfill = 0; *nearfull = 0; for (int i = 0; i < max_osd; ++i) { if (exists(i) && is_up(i) && is_in(i)) { if (osd_state[i] & CEPH_OSD_FULL) ++(*full); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + ++(*backfill); else if 
(osd_state[i] & CEPH_OSD_NEARFULL) ++(*nearfull); } @@ -1575,6 +1585,9 @@ int OSDMap::apply_incremental(const Incremental &inc) if (inc.new_nearfull_ratio >= 0) { nearfull_ratio = inc.new_nearfull_ratio; } + if (inc.new_backfillfull_ratio >= 0) { + backfillfull_ratio = inc.new_backfillfull_ratio; + } if (inc.new_full_ratio >= 0) { full_ratio = inc.new_full_ratio; } @@ -2148,7 +2161,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const } { - uint8_t target_v = 2; + uint8_t target_v = 3; if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { target_v = 1; } @@ -2173,6 +2186,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const if (target_v >= 2) { ::encode(nearfull_ratio, bl); ::encode(full_ratio, bl); + ::encode(backfillfull_ratio, bl); } ENCODE_FINISH(bl); // osd-only data } @@ -2390,7 +2404,7 @@ void OSDMap::decode(bufferlist::iterator& bl) } { - DECODE_START(2, bl); // extended, osd-only data + DECODE_START(3, bl); // extended, osd-only data ::decode(osd_addrs->hb_back_addr, bl); ::decode(osd_info, bl); ::decode(blacklist, bl); @@ -2407,6 +2421,11 @@ void OSDMap::decode(bufferlist::iterator& bl) nearfull_ratio = 0; full_ratio = 0; } + if (struct_v >= 3) { + ::decode(backfillfull_ratio, bl); + } else { + backfillfull_ratio = 0; + } DECODE_FINISH(bl); // osd-only data } @@ -2480,6 +2499,7 @@ void OSDMap::dump(Formatter *f) const f->dump_stream("modified") << get_modified(); f->dump_string("flags", get_flag_string()); f->dump_float("full_ratio", full_ratio); + f->dump_float("backfillfull_ratio", backfillfull_ratio); f->dump_float("nearfull_ratio", nearfull_ratio); f->dump_string("cluster_snapshot", get_cluster_snapshot()); f->dump_int("pool_max", get_pool_max()); @@ -2701,6 +2721,7 @@ void OSDMap::print(ostream& out) const out << "flags " << get_flag_string() << "\n"; out << "full_ratio " << full_ratio << "\n"; + out << "backfillfull_ratio " << backfillfull_ratio << "\n"; out << "nearfull_ratio " << nearfull_ratio << "\n"; if (get_cluster_snapshot().length()) out << "cluster_snapshot " << get_cluster_snapshot() << "\n"; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index eb0399edda6..2e8fcf800d9 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -155,6 +155,7 @@ public: string cluster_snapshot; float new_nearfull_ratio = -1; + float new_backfillfull_ratio = -1; float new_full_ratio = -1; mutable bool have_crc; ///< crc values are defined @@ -254,7 +255,7 @@ private: string cluster_snapshot; bool new_blacklist_entries; - float full_ratio = 0, nearfull_ratio = 0; + float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; mutable uint64_t cached_up_osd_features; @@ -336,10 +337,13 @@ public: float get_full_ratio() const { return full_ratio; } + float get_backfillfull_ratio() const { + return backfillfull_ratio; + } float get_nearfull_ratio() const { return nearfull_ratio; } - void count_full_nearfull_osds(int *full, int *nearfull) const; + void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const; /***** cluster state *****/ /* osds */ diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t index 275fefcc737..dd7e1756104 100644 --- a/src/test/cli/osdmaptool/clobber.t +++ b/src/test/cli/osdmaptool/clobber.t @@ -20,6 +20,7 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 @@ -43,6 +44,7 @@ modified 
\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 flags hashpspool stripe_width 0 diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t index e619f7206e9..32468a4a6fa 100644 --- a/src/test/cli/osdmaptool/create-print.t +++ b/src/test/cli/osdmaptool/create-print.t @@ -77,6 +77,7 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 diff --git a/src/test/cli/osdmaptool/create-racks.t b/src/test/cli/osdmaptool/create-racks.t index 19006986f68..0759698127d 100644 --- a/src/test/cli/osdmaptool/create-racks.t +++ b/src/test/cli/osdmaptool/create-racks.t @@ -790,6 +790,7 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0 diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index e9694064bd2..0c9cc7524c5 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -1150,6 +1150,9 @@ class TestOSD(TestArgparse): def test_set_full_ratio(self): self.set_ratio('set-full-ratio') + def test_set_backfillfull_ratio(self): + self.set_ratio('set-backfillfull-ratio') + def test_set_nearfull_ratio(self): self.set_ratio('set-nearfull-ratio') diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index 874a4f0583f..8c941443d81 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -654,6 +654,14 @@ static int update_pgmap_meta(MonitorDBStore& st) ::encode(full_ratio, bl); t->put(prefix, "full_ratio", bl); } + { + auto backfillfull_ratio = g_ceph_context->_conf->mon_osd_backfillfull_ratio; + if (backfillfull_ratio > 1.0) + backfillfull_ratio /= 100.0; + bufferlist bl; + ::encode(backfillfull_ratio, bl); + t->put(prefix, "backfillfull_ratio", bl); + } { auto nearfull_ratio = g_ceph_context->_conf->mon_osd_nearfull_ratio; if (nearfull_ratio > 1.0) From 1e2fde1012fe9df1ce114256cd72d9da902a01ab Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 31 Mar 2017 14:13:14 -0700 Subject: [PATCH 13/24] osd: Revamp injectfull op to support all full states Use check_* for injectable full checks Use is_* to just test simple cur_state Signed-off-by: David Zafman --- src/osd/OSD.cc | 101 ++++++++++++++++++++++++++++------------ src/osd/OSD.h | 39 ++++++++++++---- src/osd/PG.cc | 8 ++-- src/osd/PrimaryLogPG.cc | 6 ++- 4 files changed, 110 insertions(+), 44 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1368c9d654e..644c9b959c8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -221,7 +221,6 @@ OSDService::OSDService(OSD *osd) : whoami(osd->whoami), store(osd->store), log_client(osd->log_client), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), - injectfull(0), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), @@ -757,11 +756,14 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) nearfull_ratio = failsafe_ratio; } - enum s_names new_state; - // If testing with injectfull, let's 
keep determined state as FAILSAFE - if (ratio > failsafe_ratio) { + string inject; + s_names new_state; + if (injectfull_state > NONE && injectfull) { + new_state = injectfull_state; + inject = "(Injected)"; + } else if (ratio > failsafe_ratio) { new_state = FAILSAFE; - } else if (ratio > full_ratio || injectfull) { + } else if (ratio > full_ratio) { new_state = FULL; } else if (ratio > backfillfull_ratio) { new_state = BACKFILLFULL; @@ -776,6 +778,7 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) << ", full_ratio " << full_ratio << ", failsafe_ratio " << failsafe_ratio << ", new state " << get_full_state_name(new_state) + << " " << inject << dendl; // warn @@ -816,48 +819,73 @@ bool OSDService::need_fullness_update() return want != cur; } -bool OSDService::check_failsafe_full() +bool OSDService::_check_full(s_names type, ostream &ss) const { Mutex::Locker l(full_status_lock); - if (injectfull) { - // injectfull is either a count of the number of times to return full + if (injectfull && injectfull_state >= type) { + // injectfull is either a count of the number of times to return failsafe full // or if -1 then always return full if (injectfull > 0) --injectfull; - dout(5) << __func__ << " Injected full OSD (" << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")" << dendl; + ss << "Injected " << get_full_state_name(type) << " OSD (" + << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"; return true; } + ss << "current usage is " << cur_ratio; + return cur_state >= type; +} + +bool OSDService::check_failsafe_full(ostream &ss) const +{ + return _check_full(FAILSAFE, ss); +} + +bool OSDService::check_full(ostream &ss) const +{ + return _check_full(FULL, ss); +} + +bool OSDService::check_backfill_full(ostream &ss) const +{ + return _check_full(BACKFILLFULL, ss); +} + +bool OSDService::check_nearfull(ostream &ss) const +{ + return _check_full(NEARFULL, ss); +} + +bool OSDService::is_failsafe_full() const +{ + Mutex::Locker l(full_status_lock); return cur_state == FAILSAFE; } -bool OSDService::is_nearfull() -{ - Mutex::Locker l(full_status_lock); - return cur_state >= NEARFULL; -} - -bool OSDService::is_backfillfull() -{ - Mutex::Locker l(full_status_lock); - return cur_state >= BACKFILLFULL; -} - -bool OSDService::is_full() +bool OSDService::is_full() const { Mutex::Locker l(full_status_lock); return cur_state >= FULL; } -bool OSDService::too_full_for_backfill(ostream &ss) +bool OSDService::is_backfillfull() const { Mutex::Locker l(full_status_lock); - if (cur_state >= BACKFILLFULL) { - ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio"; - return true; - } - return false; + return cur_state >= BACKFILLFULL; +} + +bool OSDService::is_nearfull() const +{ + Mutex::Locker l(full_status_lock); + return cur_state >= NEARFULL; +} + +void OSDService::set_injectfull(s_names type, int64_t count) +{ + Mutex::Locker l(full_status_lock); + injectfull_state = type; + injectfull = count; } void OSDService::update_osd_stat(vector& hb_peers) @@ -2660,6 +2688,7 @@ void OSD::final_init() r = admin_socket->register_command( "injectfull", "injectfull " \ + "name=type,type=CephString,req=false " \ "name=count,type=CephInt,req=false ", test_ops_hook, "Inject a full disk (optional count times)"); @@ -4887,7 +4916,21 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, return; } if (command == "injectfull") { - cmd_getval(service->cct, cmdmap, "count", service->injectfull, (int64_t)-1); + int64_t count; + string type; + 
OSDService::s_names state; + cmd_getval(service->cct, cmdmap, "type", type, string("full")); + cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1); + if (type == "none" || count == 0) { + type = "none"; + count = 0; + } + state = service->get_full_state(type); + if (state == OSDService::s_names::INVALID) { + ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)"; + return; + } + service->set_injectfull(state, count); return; } ss << "Internal error - command=" << command; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 19a1ac7a2df..45f76415f49 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -461,7 +461,6 @@ public: LogClient &log_client; LogChannelRef clog; PGRecoveryStats &pg_recovery_stats; - int64_t injectfull; private: Messenger *&cluster_messenger; Messenger *&client_messenger; @@ -1138,9 +1137,10 @@ public: // -- OSD Full Status -- private: - Mutex full_status_lock; - enum s_names { NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending - const char *get_full_state_name(s_names s) { + friend TestOpsSocketHook; + mutable Mutex full_status_lock; + enum s_names { INVALID = -1, NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending + const char *get_full_state_name(s_names s) const { switch (s) { case NONE: return "none"; case NEARFULL: return "nearfull"; @@ -1150,16 +1150,37 @@ private: default: return "???"; } } + s_names get_full_state(string type) const { + if (type == "none") + return NONE; + else if (type == "failsafe") + return FAILSAFE; + else if (type == "full") + return FULL; + else if (type == "backfillfull") + return BACKFILLFULL; + else if (type == "nearfull") + return NEARFULL; + else + return INVALID; + } double cur_ratio; ///< current utilization + mutable int64_t injectfull = 0; + s_names injectfull_state = NONE; float get_failsafe_full_ratio(); void check_full_status(const osd_stat_t &stat); + bool _check_full(s_names type, ostream &ss) const; public: - bool check_failsafe_full(); - bool is_nearfull(); - bool is_backfillfull(); - bool is_full(); - bool too_full_for_backfill(ostream &ss); + bool check_failsafe_full(ostream &ss) const; + bool check_full(ostream &ss) const; + bool check_backfill_full(ostream &ss) const; + bool check_nearfull(ostream &ss) const; + bool is_failsafe_full() const; + bool is_full() const; + bool is_backfillfull() const; + bool is_nearfull() const; bool need_fullness_update(); ///< osdmap state needs update + void set_injectfull(s_names type, int64_t count); // -- epochs -- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 7cee9141820..d8d9c26dd14 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6561,8 +6561,8 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt) ldout(pg->cct, 10) << "backfill reservation rejected: failure injection" << dendl; post_event(RemoteReservationRejected()); - } else if (pg->osd->too_full_for_backfill(ss) && - !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { + } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && + pg->osd->check_backfill_full(ss)) { ldout(pg->cct, 10) << "backfill reservation rejected: " << ss.str() << dendl; post_event(RemoteReservationRejected()); @@ -6597,8 +6597,8 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); post_event(RemoteReservationRejected()); return discard_event(); - } else if (pg->osd->too_full_for_backfill(ss) && - 
!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { + } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && + pg->osd->check_backfill_full(ss)) { ldout(pg->cct, 10) << "backfill reservation rejected after reservation: " << ss.str() << dendl; pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 553ffca191e..5a73ca5e982 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1891,8 +1891,10 @@ void PrimaryLogPG::do_op(OpRequestRef& op) // mds should have stopped writing before this point. // We can't allow OSD to become non-startable even if mds // could be writing as part of file removals. - if (write_ordered && osd->check_failsafe_full()) { + ostringstream ss; + if (write_ordered && osd->check_failsafe_full(ss)) { dout(10) << __func__ << " fail-safe full check failed, dropping request" + << ss.str() << dendl; return; } @@ -3332,7 +3334,7 @@ void PrimaryLogPG::do_scan( case MOSDPGScan::OP_SCAN_GET_DIGEST: { ostringstream ss; - if (osd->too_full_for_backfill(ss)) { + if (osd->check_backfill_full(ss)) { dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl; queue_peering_event( CephPeeringEvtRef( From 1711ccdec779d32c4b128c1e3f6d10bd78abcb1c Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Apr 2017 16:20:35 -0700 Subject: [PATCH 14/24] osd: Check failsafe full and crash on push/pull Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 5 +++++ src/osd/PGBackend.h | 2 ++ src/osd/PrimaryLogPG.cc | 4 ++++ src/osd/PrimaryLogPG.h | 1 + src/osd/ReplicatedBackend.cc | 12 ++++++++++++ 5 files changed, 24 insertions(+) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 8e7b09fe38a..ea285cc4e6f 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -282,6 +282,11 @@ void ECBackend::handle_recovery_push( const PushOp &op, RecoveryMessages *m) { + ostringstream ss; + if (get_parent()->check_failsafe_full(ss)) { + dout(10) << __func__ << " Out of space (failsafe) processing push request: " << ss.str() << dendl; + ceph_abort(); + } bool oneshot = op.before_progress.first && op.after_progress.data_complete; ghobject_t tobj; diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 763e02bc2b8..ddd0612b131 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -261,6 +261,8 @@ typedef ceph::shared_ptr OSDMapRef; virtual LogClientTemp clog_error() = 0; + virtual bool check_failsafe_full(ostream &ss) = 0; + virtual ~Listener() {} }; Listener *parent; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 5a73ca5e982..9398783d867 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -13272,6 +13272,10 @@ int PrimaryLogPG::getattrs_maybe_cache( return r; } +bool PrimaryLogPG::check_failsafe_full(ostream &ss) { + return osd->check_failsafe_full(ss); +} + void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); } void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 3d92e3b968c..f5b025db0e4 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1731,6 +1731,7 @@ public: void on_flushed() override; void on_removal(ObjectStore::Transaction *t) override; void on_shutdown() override; + bool check_failsafe_full(ostream &ss) override; // attr cache handling void setattr_maybe_cache( diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index b897eddf815..1364cc62770 100644 --- 
a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -807,6 +807,11 @@ void ReplicatedBackend::_do_push(OpRequestRef op) vector replies; ObjectStore::Transaction t; + ostringstream ss; + if (get_parent()->check_failsafe_full(ss)) { + dout(10) << __func__ << " Out of space (failsafe) processing push request: " << ss.str() << dendl; + ceph_abort(); + } for (vector::const_iterator i = m->pushes.begin(); i != m->pushes.end(); ++i) { @@ -862,6 +867,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) op->mark_started(); vector replies(1); + + ostringstream ss; + if (get_parent()->check_failsafe_full(ss)) { + dout(10) << __func__ << " Out of space (failsafe) processing pull response (push): " << ss.str() << dendl; + ceph_abort(); + } + ObjectStore::Transaction t; list to_continue; for (vector::const_iterator i = m->pushes.begin(); From 94e253ce37d6e840b47513c103c871f05111c2c6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 10:30:57 -0700 Subject: [PATCH 15/24] osd: Rename backfill_request_* to recovery_request_* To be used by both recovery and backfill Signed-off-by: David Zafman --- src/osd/OSD.cc | 10 +++++----- src/osd/OSD.h | 6 +++--- src/osd/PG.cc | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 644c9b959c8..2bc8cce857e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -255,8 +255,8 @@ OSDService::OSDService(OSD *osd) : watch_lock("OSDService::watch_lock"), watch_timer(osd->client_messenger->cct, watch_lock), next_notif_id(0), - backfill_request_lock("OSDService::backfill_request_lock"), - backfill_request_timer(cct, backfill_request_lock, false), + recovery_request_lock("OSDService::recovery_request_lock"), + recovery_request_timer(cct, recovery_request_lock, false), reserver_finisher(cct), local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills, cct->_conf->osd_min_recovery_priority), @@ -495,8 +495,8 @@ void OSDService::shutdown() objecter_finisher.stop(); { - Mutex::Locker l(backfill_request_lock); - backfill_request_timer.shutdown(); + Mutex::Locker l(recovery_request_lock); + recovery_request_timer.shutdown(); } { @@ -2200,7 +2200,7 @@ int OSD::init() tick_timer.init(); tick_timer_without_osd_lock.init(); - service.backfill_request_timer.init(); + service.recovery_request_timer.init(); // mount. 
dout(2) << "mounting " << dev_path << " " diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 45f76415f49..591e5d2d2c7 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -917,9 +917,9 @@ public: return (((uint64_t)cur_epoch) << 32) | ((uint64_t)(next_notif_id++)); } - // -- Backfill Request Scheduling -- - Mutex backfill_request_lock; - SafeTimer backfill_request_timer; + // -- Recovery/Backfill Request Scheduling -- + Mutex recovery_request_lock; + SafeTimer recovery_request_timer; // -- tids -- // for ops i issue diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d8d9c26dd14..e7e21c61302 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3809,8 +3809,8 @@ void PG::reject_reservation() void PG::schedule_backfill_full_retry() { - Mutex::Locker lock(osd->backfill_request_lock); - osd->backfill_request_timer.add_event_after( + Mutex::Locker lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( cct->_conf->osd_backfill_retry_interval, new QueuePeeringEvt( this, get_osdmap()->get_epoch(), From c7e8dcad3414651e44c6b543e41f05f05c946d51 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 08:05:58 -0700 Subject: [PATCH 16/24] osd: Add check_osdmap_full() to check for shard OSD fullness Signed-off-by: David Zafman --- src/osd/OSD.cc | 10 ++++++++++ src/osd/OSD.h | 1 + src/osd/PGBackend.h | 2 ++ src/osd/PrimaryLogPG.cc | 5 +++++ src/osd/PrimaryLogPG.h | 1 + 5 files changed, 19 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2bc8cce857e..2b2aa97beb0 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -921,6 +921,16 @@ void OSDService::update_osd_stat(vector& hb_peers) check_full_status(osd_stat); } +bool OSDService::check_osdmap_full(const set &missing_on) +{ + OSDMapRef osdmap = get_osdmap(); + for (auto shard : missing_on) { + if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL) + return true; + } + return false; +} + void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch) { OSDMapRef next_map = get_nextmap_reserved(); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 591e5d2d2c7..9429640a9b5 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1181,6 +1181,7 @@ public: bool is_nearfull() const; bool need_fullness_update(); ///< osdmap state needs update void set_injectfull(s_names type, int64_t count); + bool check_osdmap_full(const set &missing_on); // -- epochs -- diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index ddd0612b131..66bb890af01 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -263,6 +263,8 @@ typedef ceph::shared_ptr OSDMapRef; virtual bool check_failsafe_full(ostream &ss) = 0; + virtual bool check_osdmap_full(const set &missing_on) = 0; + virtual ~Listener() {} }; Listener *parent; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 9398783d867..71846caabf8 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -13031,6 +13031,11 @@ void PrimaryLogPG::_scrub_finish() } } +bool PrimaryLogPG::check_osdmap_full(const set &missing_on) +{ + return osd->check_osdmap_full(missing_on); +} + /*---SnapTrimmer Logging---*/ #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index f5b025db0e4..a7e812fe8da 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1732,6 +1732,7 @@ public: void on_removal(ObjectStore::Transaction *t) override; void on_shutdown() override; bool check_failsafe_full(ostream &ss) override; + bool check_osdmap_full(const set &missing_on) override; // attr 
cache handling void setattr_maybe_cache( From 27e14504f6c4b7a8e06b038fc396c40e8e6c1456 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 5 Apr 2017 14:09:18 -0700 Subject: [PATCH 17/24] osd: Add PG state and flag for too full for recovery New state machine state NotRecovering New PG state PG_STATE_RECOVERY_TOOFULL Signed-off-by: David Zafman --- .../osd_internals/recovery_reservation.rst | 1 + src/common/config_opts.h | 3 ++ src/mon/PGMonitor.cc | 3 ++ src/osd/OSD.cc | 1 + src/osd/OSD.h | 1 + src/osd/PG.cc | 31 +++++++++++++++++++ src/osd/PG.h | 9 ++++++ src/osd/osd_types.cc | 4 +++ src/osd/osd_types.h | 1 + 9 files changed, 54 insertions(+) diff --git a/doc/dev/osd_internals/recovery_reservation.rst b/doc/dev/osd_internals/recovery_reservation.rst index cabea04cc73..4ab03192fe5 100644 --- a/doc/dev/osd_internals/recovery_reservation.rst +++ b/doc/dev/osd_internals/recovery_reservation.rst @@ -62,6 +62,7 @@ to the monitor. The state chart can set: - recovery_wait: waiting for local/remote reservations - recovering: recovering + - recovery_toofull: recovery stopped, OSD(s) above full ratio - backfill_wait: waiting for remote backfill reservations - backfilling: backfilling - backfill_toofull: backfill stopped, OSD(s) above backfillfull ratio diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 9eed4d485e2..c1630a44c01 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -630,6 +630,9 @@ OPTION(osd_min_recovery_priority, OPT_INT, 0) // Seconds to wait before retrying refused backfills OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) +// Seconds to wait before retrying refused recovery +OPTION(osd_recovery_retry_interval, OPT_DOUBLE, 30.0) + // max agent flush ops OPTION(osd_agent_max_ops, OPT_INT, 4) OPTION(osd_agent_max_low_ops, OPT_INT, 2) diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 6669ffcd4b3..477dabce4e0 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1316,6 +1316,8 @@ void PGMonitor::get_health(list >& summary, note["backfilling"] += p->second; if (p->first & PG_STATE_BACKFILL_TOOFULL) note["backfill_toofull"] += p->second; + if (p->first & PG_STATE_RECOVERY_TOOFULL) + note["recovery_toofull"] += p->second; } ceph::unordered_map stuck_pgs; @@ -1403,6 +1405,7 @@ void PGMonitor::get_health(list >& summary, PG_STATE_REPAIR | PG_STATE_RECOVERING | PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERY_TOOFULL | PG_STATE_INCOMPLETE | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL | diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2b2aa97beb0..e39330e0923 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2910,6 +2910,7 @@ void OSD::create_recoverystate_perf() rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency"); rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); + rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency"); recoverystate_perf = rs_perf.create_perf_counters(); cct->get_perfcounters_collection()->add(recoverystate_perf); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9429640a9b5..f6afebd4df0 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -202,6 +202,7 @@ enum { rs_down_latency, rs_getmissing_latency, rs_waitupthru_latency, + rs_notrecovering_latency, rs_last, }; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index e7e21c61302..576ec836dc5 100644 --- a/src/osd/PG.cc 
+++ b/src/osd/PG.cc @@ -3817,6 +3817,16 @@ void PG::schedule_backfill_full_retry() RequestBackfill())); } +void PG::schedule_recovery_full_retry() +{ + Mutex::Locker lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( + cct->_conf->osd_recovery_retry_interval, + new QueuePeeringEvt( + this, get_osdmap()->get_epoch(), + DoRecovery())); +} + void PG::clear_scrub_reserved() { scrubber.reserved_peers.clear(); @@ -5237,6 +5247,7 @@ void PG::start_peering_interval( state_clear(PG_STATE_PEERED); state_clear(PG_STATE_DOWN); state_clear(PG_STATE_RECOVERY_WAIT); + state_clear(PG_STATE_RECOVERY_TOOFULL); state_clear(PG_STATE_RECOVERING); peer_purged.clear(); @@ -6488,6 +6499,24 @@ void PG::RecoveryState::NotBackfilling::exit() pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); } +/*----NotRecovering------*/ +PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/NotRecovering") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->publish_stats_to_osd(); +} + +void PG::RecoveryState::NotRecovering::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur); +} + /*---RepNotRecovering----*/ PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx) : my_base(ctx), @@ -6737,6 +6766,7 @@ PG::RecoveryState::Recovering::Recovering(my_context ctx) PG *pg = context< RecoveryMachine >().pg; pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); pg->state_set(PG_STATE_RECOVERING); pg->publish_stats_to_osd(); pg->queue_recovery(); @@ -7185,6 +7215,7 @@ void PG::RecoveryState::Active::exit() pg->state_clear(PG_STATE_BACKFILL_TOOFULL); pg->state_clear(PG_STATE_BACKFILL_WAIT); pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); utime_t dur = ceph_clock_now() - enter_time; pg->osd->recoverystate_perf->tinc(rs_active_latency, dur); pg->agent_stop(); diff --git a/src/osd/PG.h b/src/osd/PG.h index 4763859d66b..6ac48fd6cbf 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1340,6 +1340,7 @@ public: void reject_reservation(); void schedule_backfill_full_retry(); + void schedule_recovery_full_retry(); // -- recovery state -- @@ -1850,6 +1851,14 @@ public: boost::statechart::result react(const RemoteReservationRejected& evt); }; + struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved > + > reactions; + explicit NotRecovering(my_context ctx); + void exit(); + }; + struct RepNotRecovering; struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { explicit ReplicaActive(my_context ctx); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2cce17e029f..145a0d76831 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -789,6 +789,8 @@ std::string pg_state_string(int state) oss << "clean+"; if (state & PG_STATE_RECOVERY_WAIT) oss << "recovery_wait+"; + if (state & PG_STATE_RECOVERY_TOOFULL) + oss << "recovery_toofull+"; if (state & PG_STATE_RECOVERING) oss << "recovering+"; if (state & PG_STATE_DOWN) @@ -869,6 +871,8 @@ int pg_string_state(const std::string& state) type = 
PG_STATE_BACKFILL_TOOFULL; else if (state == "recovery_wait") type = PG_STATE_RECOVERY_WAIT; + else if (state == "recovery_toofull") + type = PG_STATE_RECOVERY_TOOFULL; else if (state == "undersized") type = PG_STATE_UNDERSIZED; else if (state == "activating") diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 0d2297582f5..1c4e4c65a6c 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -971,6 +971,7 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) { #define PG_STATE_PEERED (1<<25) // peered, cannot go active, can recover #define PG_STATE_SNAPTRIM (1<<26) // trimming snaps #define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps +#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full std::string pg_state_string(int state); std::string pg_vector_string(const vector &a); From 84088568b5bf4fb3fa48ebf3e157d288f9f29eed Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 5 Apr 2017 14:12:43 -0700 Subject: [PATCH 18/24] osd: Check whether any OSD is full before starting recovery Add event RecoveryTooFull to move to NotRecovering state Signed-off-by: David Zafman --- src/common/config_opts.h | 1 + src/osd/PG.cc | 18 ++++++++++++++++++ src/osd/PG.h | 5 ++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index c1630a44c01..c742ae6fa20 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -871,6 +871,7 @@ OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL, false) OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0) OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false) // inject failure during copyfrom completion OPTION(osd_debug_misdirected_ops, OPT_BOOL, false) +OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL, false) OPTION(osd_enxio_on_misdirected_op, OPT_BOOL, false) OPTION(osd_debug_verify_cached_snaps, OPT_BOOL, false) OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 576ec836dc5..c5a35dbaa4a 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6700,6 +6700,15 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte { context< RecoveryMachine >().log_enter(state_name); PG *pg = context< RecoveryMachine >().pg; + + // Make sure all nodes that part of the recovery aren't full + if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery && + pg->osd->check_osdmap_full(pg->actingbackfill)) { + post_event(RecoveryTooFull()); + return; + } + + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); pg->state_set(PG_STATE_RECOVERY_WAIT); pg->osd->local_reserver.request_reservation( pg->info.pgid, @@ -6710,6 +6719,15 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte pg->publish_stats_to_osd(); } +boost::statechart::result +PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_RECOVERY_TOOFULL); + pg->schedule_recovery_full_retry(); + return transit(); +} + void PG::RecoveryState::WaitLocalRecoveryReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); diff --git a/src/osd/PG.h b/src/osd/PG.h index 6ac48fd6cbf..b2168b3ee64 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1506,6 +1506,7 @@ public: TrivialEvent(RequestRecovery) TrivialEvent(RecoveryDone) TrivialEvent(BackfillTooFull) + TrivialEvent(RecoveryTooFull) TrivialEvent(AllReplicasRecovered) TrivialEvent(DoRecovery) @@ 
-1947,10 +1948,12 @@ public: struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState { typedef boost::mpl::list < - boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved > + boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >, + boost::statechart::custom_reaction< RecoveryTooFull > > reactions; explicit WaitLocalRecoveryReserved(my_context ctx); void exit(); + boost::statechart::result react(const RecoveryTooFull &evt); }; struct Activating : boost::statechart::state< Activating, Active >, NamedState { From 1fafec217599b65d902e21efd43049646e1e145c Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 11 Apr 2017 22:04:07 -0700 Subject: [PATCH 19/24] osd: check_full_status() remove bogus comment and use equivalent computation We actually compute kb_used as kb - kb_avail. We don't have the statfs() system call issue of non-privileged f_bavail vs f_bfree. It was assumed that used was really like (blocks - f_bfree). It is not. Signed-off-by: David Zafman --- src/osd/OSD.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index e39330e0923..c8031826f4e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -716,13 +716,7 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) { Mutex::Locker l(full_status_lock); - // We base ratio on kb_avail rather than kb_used because they can - // differ significantly e.g. on btrfs volumes with a large number of - // chunks reserved for metadata, and for our purposes (avoiding - // completely filling the disk) it's far more important to know how - // much space is available to use than how much we've already used. - float ratio = ((float)(osd_stat.kb - osd_stat.kb_avail)) / - ((float)osd_stat.kb); + float ratio = ((float)osd_stat.kb_used) / ((float)osd_stat.kb); cur_ratio = ratio; // The OSDMap ratios take precendence. So if the failsafe is .95 and From afd739bed6fb56358c147af32293bff7cc2995a4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 11:41:18 -0700 Subject: [PATCH 20/24] mon: Use currently configured full ratio to determine available space This fixes a bug where available space was derived from the mon_osd_full_ratio default initial value rather than the currently configured full ratio.
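A worked example of the calculation this touches (the numbers are illustrative, not taken from a real cluster): with fratio = 0.95 and an OSD reporting kb = 1000000 and kb_avail = 300000, the space held back is unusable = 1000000 * (1 - 0.95) = 50000 KB, so that OSD contributes avail = MAX(0, 300000 - 50000) = 250000 KB to the rule's projected capacity. Before this change the held-back amount always came from the mon_osd_full_ratio default, so a later "ceph osd set-full-ratio" had no effect on this calculation.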
Signed-off-by: David Zafman --- src/mon/PGMap.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index ee263d3f792..68f4879a538 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -1878,6 +1878,17 @@ int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const return 0; } + float fratio; + if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS) && osdmap.get_full_ratio() > 0) { + fratio = osdmap.get_full_ratio(); + } else if (full_ratio > 0) { + fratio = full_ratio; + } else { + // this shouldn't really happen + fratio = g_conf->mon_osd_full_ratio; + if (fratio > 1.0) fratio /= 100; + } + int64_t min = -1; for (map::iterator p = wm.begin(); p != wm.end(); ++p) { ceph::unordered_map::const_iterator osd_info = @@ -1892,7 +1903,7 @@ int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const continue; } double unusable = (double)osd_info->second.kb * - (1.0 - g_conf->mon_osd_full_ratio); + (1.0 - fratio); double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable); avail *= 1024.0; int64_t proj = (int64_t)(avail / (double)p->second); From c83f11de002a136890281634bc30715434468475 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 11:43:18 -0700 Subject: [PATCH 21/24] mon: Always fix-up full ratios when specified incorrectly in config Signed-off-by: David Zafman --- src/mon/OSDMonitor.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 3566c3f7330..de9fb514259 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -164,8 +164,11 @@ void OSDMonitor::create_initial() if (!g_conf->mon_debug_no_require_luminous) { newmap.set_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); newmap.full_ratio = g_conf->mon_osd_full_ratio; + if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100; newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; + if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100; newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio; + if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100; } // encode into pending incremental @@ -788,9 +791,11 @@ void OSDMonitor::create_pending() // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config // instead of osd_backfill_full_ratio config if (osdmap.backfillfull_ratio <= 0) { - dout(1) << __func__ << " setting backfillfull_ratio = " - << g_conf->mon_osd_backfillfull_ratio << dendl; pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; + if (pending_inc.new_backfillfull_ratio > 1.0) + pending_inc.new_backfillfull_ratio /= 100; + dout(1) << __func__ << " setting backfillfull_ratio = " + << pending_inc.new_backfillfull_ratio << dendl; } if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { // transition full ratios from PGMap to OSDMap (on upgrade) @@ -808,14 +813,18 @@ void OSDMonitor::create_pending() } else { // safety check (this shouldn't really happen) if (osdmap.full_ratio <= 0) { - dout(1) << __func__ << " setting full_ratio = " - << g_conf->mon_osd_full_ratio << dendl; pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio; + if (pending_inc.new_full_ratio > 1.0) + pending_inc.new_full_ratio /= 100; + dout(1) << __func__ << " setting full_ratio = " + << pending_inc.new_full_ratio << dendl; } if (osdmap.nearfull_ratio <= 0) { - dout(1) << __func__ << " setting nearfull_ratio = " - << g_conf->mon_osd_nearfull_ratio << dendl; pending_inc.new_nearfull_ratio = 
g_conf->mon_osd_nearfull_ratio; + if (pending_inc.new_nearfull_ratio > 1.0) + pending_inc.new_nearfull_ratio /= 100; + dout(1) << __func__ << " setting nearfull_ratio = " + << pending_inc.new_nearfull_ratio << dendl; } } } From e4cf10d3d89e3b7adcf2b2dcdc02e020b8a793b3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 16:20:27 -0700 Subject: [PATCH 22/24] mon: Issue warning or error if a full ratio out of order The full ratios should be in this order: nearfull, backfillfull, full, failsafe full Signed-off-by: David Zafman --- src/mon/OSDMonitor.cc | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index de9fb514259..680e9e3a6fe 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3359,6 +3359,49 @@ void OSDMonitor::get_health(list >& summary, } if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + // An osd could configure failsafe ratio, to something different + // but for now assume it is the same here. + float fsr = g_conf->osd_failsafe_full_ratio; + if (fsr > 1.0) fsr /= 100; + float fr = osdmap.get_full_ratio(); + float br = osdmap.get_backfillfull_ratio(); + float nr = osdmap.get_nearfull_ratio(); + + bool out_of_order = false; + // These checks correspond to how OSDService::check_full_status() in an OSD + // handles the improper setting of these values. + if (br < nr) { + out_of_order = true; + if (detail) { + ostringstream ss; + ss << "backfill_ratio (" << br << ") < nearfull_ratio (" << nr << "), increased"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + br = nr; + } + if (fr < br) { + out_of_order = true; + if (detail) { + ostringstream ss; + ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br << "), increased"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + fr = br; + } + if (fsr < fr) { + out_of_order = true; + if (detail) { + ostringstream ss; + ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr << "), increased"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + } + if (out_of_order) { + ostringstream ss; + ss << "Full ratio(s) out of order"; + summary.push_back(make_pair(HEALTH_ERR, ss.str())); + } + int full, backfill, nearfull; osdmap.count_full_nearfull_osds(&full, &backfill, &nearfull); if (full > 0) { From 252230786557353f9d52cf633b6400097f9daba3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 20:42:55 -0700 Subject: [PATCH 23/24] mon, osd: Add detailed full information for now in the mon Show ceph health doc output in the correct order Signed-off-by: David Zafman --- .../troubleshooting/troubleshooting-osd.rst | 9 +++-- src/mon/OSDMonitor.cc | 33 +++++++++++++----- src/osd/OSDMap.cc | 34 +++++++++++++++++++ src/osd/OSDMap.h | 2 ++ 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst index 651907dfb05..fe29f4767f9 100644 --- a/doc/rados/troubleshooting/troubleshooting-osd.rst +++ b/doc/rados/troubleshooting/troubleshooting-osd.rst @@ -222,16 +222,15 @@ lowering the ``mon osd full ratio``, ``mon osd backfillfull ratio`` and Full ``ceph-osds`` will be reported by ``ceph health``:: ceph health - HEALTH_WARN 1 nearfull osds - osd.2 is near full at 85% + HEALTH_WARN 1 nearfull osd(s) Or:: - ceph health - HEALTH_ERR 1 nearfull osds, 1 backfillfull osds, 1 full osds - osd.2 is near full at 85% + ceph health detail + HEALTH_ERR 1 full osd(s); 1 backfillfull osd(s); 
1 nearfull osd(s) osd.3 is full at 97% osd.4 is backfill full at 91% + osd.2 is near full at 87% The best way to deal with a full cluster is to add new ``ceph-osds``, allowing the cluster to redistribute data to the newly available storage. diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 680e9e3a6fe..0b059271fbb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3402,23 +3402,40 @@ void OSDMonitor::get_health(list >& summary, summary.push_back(make_pair(HEALTH_ERR, ss.str())); } - int full, backfill, nearfull; - osdmap.count_full_nearfull_osds(&full, &backfill, &nearfull); - if (full > 0) { + map full, backfillfull, nearfull; + osdmap.get_full_osd_util(mon->pgmon()->pg_map.osd_stat, &full, &backfillfull, &nearfull); + if (full.size()) { ostringstream ss; - ss << full << " full osd(s)"; + ss << full.size() << " full osd(s)"; summary.push_back(make_pair(HEALTH_ERR, ss.str())); } - if (backfill > 0) { + if (backfillfull.size()) { ostringstream ss; - ss << backfill << " backfillfull osd(s)"; + ss << backfillfull.size() << " backfillfull osd(s)"; summary.push_back(make_pair(HEALTH_WARN, ss.str())); } - if (nearfull > 0) { + if (nearfull.size()) { ostringstream ss; - ss << nearfull << " nearfull osd(s)"; + ss << nearfull.size() << " nearfull osd(s)"; summary.push_back(make_pair(HEALTH_WARN, ss.str())); } + if (detail) { + for (auto& i: full) { + ostringstream ss; + ss << "osd." << i.first << " is full at " << roundf(i.second * 100) << "%"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + for (auto& i: backfillfull) { + ostringstream ss; + ss << "osd." << i.first << " is backfill full at " << roundf(i.second * 100) << "%"; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + for (auto& i: nearfull) { + ostringstream ss; + ss << "osd." 
<< i.first << " is near full at " << roundf(i.second * 100) << "%"; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + } } // note: we leave it to ceph-mgr to generate details health warnings // with actual osd utilizations diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 6d0cbfe0a28..5035fab931a 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1046,6 +1046,40 @@ void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) c } } +static bool get_osd_utilization(const ceph::unordered_map &osd_stat, + int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail) { + auto p = osd_stat.find(id); + if (p == osd_stat.end()) + return false; + *kb = p->second.kb; + *kb_used = p->second.kb_used; + *kb_avail = p->second.kb_avail; + return *kb > 0; +} + +void OSDMap::get_full_osd_util(const ceph::unordered_map &osd_stat, + map *full, map *backfill, map *nearfull) const +{ + full->clear(); + backfill->clear(); + nearfull->clear(); + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + int64_t kb, kb_used, kb_avail; + if (osd_state[i] & CEPH_OSD_FULL) { + if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail)) + full->emplace(i, (float)kb_used / (float)kb); + } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) { + if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail)) + backfill->emplace(i, (float)kb_used / (float)kb); + } else if (osd_state[i] & CEPH_OSD_NEARFULL) { + if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail)) + nearfull->emplace(i, (float)kb_used / (float)kb); + } + } + } +} + void OSDMap::get_all_osds(set& ls) const { for (int i=0; i &osd_stat, + map *full, map *backfill, map *nearfull) const; /***** cluster state *****/ /* osds */ From 3becdd313852e55a1b4081011b6096255b7149e4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 14 Apr 2017 17:36:17 -0700 Subject: [PATCH 24/24] test: Test health check output for full ratios Test out of order ratios summary and details Test various full osd conditions summary and details Signed-off-by: David Zafman --- qa/workunits/cephtool/test.sh | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index a6c7b617cc6..353e5f74518 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1424,6 +1424,39 @@ function test_mon_pg() ceph osd set-nearfull-ratio .892 ceph osd dump | grep '^nearfull_ratio 0.892' + # Check health status + ceph osd set-nearfull-ratio .913 + ceph health | grep 'HEALTH_ERR Full ratio(s) out of order' + ceph health detail | grep 'backfill_ratio (0.912) < nearfull_ratio (0.913), increased' + ceph osd set-nearfull-ratio .892 + ceph osd set-backfillfull-ratio .963 + ceph health detail | grep 'full_ratio (0.962) < backfillfull_ratio (0.963), increased' + ceph osd set-backfillfull-ratio .912 + + # Check injected full results + WAITFORFULL=10 + ceph --admin-daemon $CEPH_OUT_DIR/osd.0.asok injectfull nearfull + sleep $WAITFORFULL + ceph health | grep "HEALTH_WARN.*1 nearfull osd(s)" + ceph --admin-daemon $CEPH_OUT_DIR/osd.1.asok injectfull backfillfull + sleep $WAITFORFULL + ceph health | grep "HEALTH_WARN.*1 backfillfull osd(s)" + ceph --admin-daemon $CEPH_OUT_DIR/osd.2.asok injectfull failsafe + sleep $WAITFORFULL + # failsafe and full are the same as far as the monitor is concerned + ceph health | grep "HEALTH_ERR.*1 full osd(s)" + ceph --admin-daemon $CEPH_OUT_DIR/osd.0.asok injectfull full + sleep $WAITFORFULL + ceph health | 
grep "HEALTH_ERR.*2 full osd(s)" + ceph health detail | grep "osd.0 is full at.*%" + ceph health detail | grep "osd.2 is full at.*%" + ceph health detail | grep "osd.1 is backfill full at.*%" + ceph --admin-daemon $CEPH_OUT_DIR/osd.0.asok injectfull none + ceph --admin-daemon $CEPH_OUT_DIR/osd.1.asok injectfull none + ceph --admin-daemon $CEPH_OUT_DIR/osd.2.asok injectfull none + sleep $WAITFORFULL + ceph health | grep HEALTH_OK + ceph pg stat | grep 'pgs:' ceph pg 0.0 query ceph tell 0.0 query
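The cephtool workunit above is the test of record for this series. For a quick manual smoke test against a development cluster, something along the following lines should exercise the same paths; treat it as a sketch rather than part of the patches: the osd id, the fixed 10 second settle time, and the use of the "ceph daemon osd.N ..." wrapper (equivalent to the "ceph --admin-daemon <path to osd.N.asok> ..." form used in the workunit) are assumptions about the local setup, and the injectfull handler also parses an optional count argument (default -1) that is not shown here.

    ceph osd set-nearfull-ratio .85        # keep the ratios ordered: nearfull <= backfillfull <= full,
    ceph osd set-backfillfull-ratio .90    # otherwise the mon now reports HEALTH_ERR "Full ratio(s) out of order"
    ceph osd set-full-ratio .95
    ceph osd dump | grep '_ratio'          # confirm the osdmap picked up the new values
    ceph daemon osd.0 injectfull nearfull  # make osd.0 pretend it is nearfull
    sleep 10                               # give the osd and mon time to notice
    ceph health                            # expect HEALTH_WARN with "1 nearfull osd(s)"
    ceph daemon osd.0 injectfull full
    sleep 10
    ceph health detail                     # expect HEALTH_ERR with "1 full osd(s)" and an "osd.0 is full at ...%" line
    ceph daemon osd.0 injectfull none      # clear the injected state
    sleep 10
    ceph health                            # back to HEALTH_OK on an otherwise healthy cluster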