From 1e7d227bfc511d53efbe96c52bcd5af9c7cc08b4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 28 Feb 2017 12:28:38 -0800 Subject: [PATCH 01/24] osd: Fix log message Signed-off-by: David Zafman --- src/os/filestore/FileStore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc index 475d7f59481..3b0eabd12d5 100644 --- a/src/os/filestore/FileStore.cc +++ b/src/os/filestore/FileStore.cc @@ -3952,7 +3952,7 @@ void FileStore::sync_entry() derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; assert(0 == "wait_sync got error"); } - dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl; + dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl; } } else { From 3a66f1fbf606157ea3f3ae96e2e9577a54a60886 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Apr 2017 11:28:42 -0700 Subject: [PATCH 02/24] ceph-objectstore-tool: cleanup comment Signed-off-by: David Zafman --- src/tools/ceph_objectstore_tool.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index 6b173f204d0..f9bf85a5d63 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -2906,7 +2906,7 @@ int main(int argc, char **argv) throw std::runtime_error(ss.str()); } vector::iterator i = array.begin(); - //if (i == array.end() || i->type() != json_spirit::str_type) { + assert(i != array.end()); if (i->type() != json_spirit::str_type) { ss << "Object '" << object << "' must be a JSON array with the first element a string"; From 6c930f7e661de92533fac9b9bdd197f07b67ff3f Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 10:40:08 -0700 Subject: [PATCH 03/24] osd: Increase osd_backfill_retry_interval to 30 seconds Signed-off-by: David Zafman --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ae6fdf2ccb3..974ee7efd33 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -630,7 +630,7 @@ OPTION(osd_min_recovery_priority, OPT_INT, 0) OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85) // Seconds to wait before retrying refused backfills -OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 10.0) +OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) // max agent flush ops OPTION(osd_agent_max_ops, OPT_INT, 4) From d024ab05049f39741d05f9d202370b3d3ebfc731 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 10:27:31 -0700 Subject: [PATCH 04/24] osd: Remove unused argument to clear_queued_recovery Signed-off-by: David Zafman --- src/osd/OSD.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 73998a100e5..fc60df7c65f 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1025,7 +1025,7 @@ public: Mutex::Locker l(recovery_lock); _maybe_queue_recovery(); } - void clear_queued_recovery(PG *pg, bool front = false) { + void clear_queued_recovery(PG *pg) { Mutex::Locker l(recovery_lock); for (list >::iterator i = awaiting_throttle.begin(); i != awaiting_throttle.end(); From 811f89a682e831f98d602c9887034cef517a009a Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 7 Apr 2017 09:36:26 -0700 Subject: [PATCH 05/24] test: Switch from pg to osd for set-*-ratio commands Testing of 6422e0a220fb3f32ccae50e0c7e52dc9984685c6 Signed-off-by: David Zafman --- src/test/pybind/test_ceph_argparse.py | 30 
+++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 0608655b587..e9694064bd2 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -183,21 +183,6 @@ class TestPG(TestArgparse): def test_force_create_pg(self): self.one_pgid('force_create_pg') - def set_ratio(self, command): - self.assert_valid_command(['pg', - command, - '0.0']) - assert_equal({}, validate_command(sigdict, ['pg', command])) - assert_equal({}, validate_command(sigdict, ['pg', - command, - '2.0'])) - - def test_set_full_ratio(self): - self.set_ratio('set_full_ratio') - - def test_set_nearfull_ratio(self): - self.set_ratio('set_nearfull_ratio') - class TestAuth(TestArgparse): @@ -1153,6 +1138,21 @@ class TestOSD(TestArgparse): 'poolname', 'toomany'])) + def set_ratio(self, command): + self.assert_valid_command(['osd', + command, + '0.0']) + assert_equal({}, validate_command(sigdict, ['osd', command])) + assert_equal({}, validate_command(sigdict, ['osd', + command, + '2.0'])) + + def test_set_full_ratio(self): + self.set_ratio('set-full-ratio') + + def test_set_nearfull_ratio(self): + self.set_ratio('set-nearfull-ratio') + class TestConfigKey(TestArgparse): From e927cd2cf23d97e0d49647f79acd2f3a95bd43c3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 7 Apr 2017 11:52:57 -0700 Subject: [PATCH 06/24] test: Fix intended test flow and restore nearfull-ratio This is inconsequential but seems to have always been wrong since original commit 6cafb0e3e0bc5f992c2483a46cb00e83dca035cc Signed-off-by: David Zafman --- qa/workunits/rest/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index 1208f85b907..1a7ab30b886 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -362,7 +362,7 @@ if __name__ == '__main__': expect('osd/set-nearfull-ratio?ratio=0.90', 'PUT', 200, '') r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) assert(float(r.myjson['output']['nearfull_ratio']) == 0.90) - expect('osd/set-full-ratio?ratio=0.85', 'PUT', 200, '') + expect('osd/set-nearfull-ratio?ratio=0.85', 'PUT', 200, '') r = expect('pg/stat', 'GET', 200, 'json', JSONHDR) assert('num_pgs' in r.myjson['output']) From 5baf7abfa36f7fb2a174c5255b0d4afc65df6388 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 10:30:29 -0700 Subject: [PATCH 07/24] osd: Fail-safe full is a hard stop even for mds We can't allow OSD to become non-startable even if mds could be writing as part of file removals. Signed-off-by: David Zafman --- src/osd/PrimaryLogPG.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 2f1599963c9..5fd0463401f 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1888,7 +1888,10 @@ void PrimaryLogPG::do_op(OpRequestRef& op) << *m << dendl; return; } - if (!(m->get_source().is_mds()) && osd->check_failsafe_full() && write_ordered) { + // mds should have stopped writing before this point. + // We can't allow OSD to become non-startable even if mds + // could be writing as part of file removals. 
+ if (write_ordered && osd->check_failsafe_full()) { dout(10) << __func__ << " fail-safe full check failed, dropping request" << dendl; return; From 79124330c7e8b377125dd52e12275cf81b3c38e6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 11:17:13 -0700 Subject: [PATCH 08/24] osd: too_full_for_backfill() returns ostream for reason Signed-off-by: David Zafman --- src/osd/OSD.cc | 7 ++----- src/osd/OSD.h | 2 +- src/osd/PG.cc | 18 ++++++++---------- src/osd/PrimaryLogPG.cc | 7 +++---- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7b1a021a228..167c0c1ff3a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -823,15 +823,12 @@ bool OSDService::is_full() return cur_state >= FULL; } -bool OSDService::too_full_for_backfill(double *_ratio, double *_max_ratio) +bool OSDService::too_full_for_backfill(ostream &ss) { Mutex::Locker l(full_status_lock); double max_ratio; max_ratio = cct->_conf->osd_backfill_full_ratio; - if (_ratio) - *_ratio = cur_ratio; - if (_max_ratio) - *_max_ratio = max_ratio; + ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio " << max_ratio; return cur_ratio >= max_ratio; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index fc60df7c65f..e593d552a6b 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1155,7 +1155,7 @@ public: bool check_failsafe_full(); bool is_nearfull(); bool is_full(); - bool too_full_for_backfill(double *ratio, double *max_ratio); + bool too_full_for_backfill(ostream &ss); bool need_fullness_update(); ///< osdmap state needs update diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 56b9c19bed8..7cee9141820 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6554,18 +6554,17 @@ boost::statechart::result PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt) { PG *pg = context< RecoveryMachine >().pg; - double ratio, max_ratio; + ostringstream ss; if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 && (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { ldout(pg->cct, 10) << "backfill reservation rejected: failure injection" << dendl; post_event(RemoteReservationRejected()); - } else if (pg->osd->too_full_for_backfill(&ratio, &max_ratio) && + } else if (pg->osd->too_full_for_backfill(ss) && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { - ldout(pg->cct, 10) << "backfill reservation rejected: full ratio is " - << ratio << ", which is greater than max allowed ratio " - << max_ratio << dendl; + ldout(pg->cct, 10) << "backfill reservation rejected: " + << ss.str() << dendl; post_event(RemoteReservationRejected()); } else { pg->osd->remote_reserver.request_reservation( @@ -6590,7 +6589,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & { PG *pg = context< RecoveryMachine >().pg; - double ratio, max_ratio; + ostringstream ss; if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 && (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { ldout(pg->cct, 10) << "backfill reservation rejected after reservation: " @@ -6598,11 +6597,10 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); post_event(RemoteReservationRejected()); return discard_event(); - } else if (pg->osd->too_full_for_backfill(&ratio, &max_ratio) && + } else if (pg->osd->too_full_for_backfill(ss) && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { 
- ldout(pg->cct, 10) << "backfill reservation rejected after reservation: full ratio is " - << ratio << ", which is greater than max allowed ratio " - << max_ratio << dendl; + ldout(pg->cct, 10) << "backfill reservation rejected after reservation: " + << ss.str() << dendl; pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); post_event(RemoteReservationRejected()); return discard_event(); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 5fd0463401f..553ffca191e 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -3331,10 +3331,9 @@ void PrimaryLogPG::do_scan( switch (m->op) { case MOSDPGScan::OP_SCAN_GET_DIGEST: { - double ratio, full_ratio; - if (osd->too_full_for_backfill(&ratio, &full_ratio)) { - dout(1) << __func__ << ": Canceling backfill, current usage is " - << ratio << ", which exceeds " << full_ratio << dendl; + ostringstream ss; + if (osd->too_full_for_backfill(ss)) { + dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl; queue_peering_event( CephPeeringEvtRef( std::make_shared( From 79a4ac41c5fd6613e5b9a51de595a7043425916d Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 29 Mar 2017 17:35:51 -0700 Subject: [PATCH 09/24] common: Remove unused config option osd_recovery_threads Signed-off-by: David Zafman --- doc/rados/configuration/osd-config-ref.rst | 7 ------- doc/rados/operations/monitoring-osd-pg.rst | 3 +-- src/common/config_opts.h | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst index 06e46a6eab9..d68316dff60 100644 --- a/doc/rados/configuration/osd-config-ref.rst +++ b/doc/rados/configuration/osd-config-ref.rst @@ -673,13 +673,6 @@ perform well in a degraded state. :Default: ``8 << 20`` -``osd recovery threads`` - -:Description: The number of threads for recovering data. -:Type: 32-bit Integer -:Default: ``1`` - - ``osd recovery thread timeout`` :Description: The maximum time in seconds before timing out a recovery thread. diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst index 866ae8313ba..37f1960cba6 100644 --- a/doc/rados/operations/monitoring-osd-pg.rst +++ b/doc/rados/operations/monitoring-osd-pg.rst @@ -468,8 +468,7 @@ Ceph provides a number of settings to balance the resource contention between new service requests and the need to recover data objects and restore the placement groups to the current state. The ``osd recovery delay start`` setting allows an OSD to restart, re-peer and even process some replay requests before -starting the recovery process. The ``osd recovery threads`` setting limits the -number of threads for the recovery process (1 thread by default). The ``osd +starting the recovery process. The ``osd recovery thread timeout`` sets a thread timeout, because multiple OSDs may fail, restart and re-peer at staggered rates. 
The ``osd recovery max active`` setting limits the number of recovery requests an OSD will entertain simultaneously to diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 974ee7efd33..3de8c42ee89 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -742,7 +742,6 @@ OPTION(osd_op_pq_min_cost, OPT_U64, 65536) OPTION(osd_disk_threads, OPT_INT, 1) OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be best effort idle OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7 -OPTION(osd_recovery_threads, OPT_INT, 1) OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration OPTION(osd_op_num_threads_per_shard, OPT_INT, 2) OPTION(osd_op_num_shards, OPT_INT, 5) From 9dd695299901174babaffc7043a69c35f5302c66 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Apr 2017 15:46:37 -0700 Subject: [PATCH 10/24] common: Bump ratio for backfillfull from 85% to 90% Signed-off-by: David Zafman --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 3de8c42ee89..a3e74b84860 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -627,7 +627,7 @@ OPTION(osd_max_backfills, OPT_U64, 1) OPTION(osd_min_recovery_priority, OPT_INT, 0) // Refuse backfills when OSD full ratio is above this value -OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85) +OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.90) // Seconds to wait before retrying refused backfills OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) From 0264bbddb7616e55e7ea90b6f5a834275ebe8985 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 11:18:38 -0700 Subject: [PATCH 11/24] osd: For testing full disks add injectfull socket command Signed-off-by: David Zafman --- src/osd/OSD.cc | 33 ++++++++++++++++++++++++++++++++- src/osd/OSD.h | 1 + 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 167c0c1ff3a..8799b7933a3 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -221,6 +221,7 @@ OSDService::OSDService(OSD *osd) : whoami(osd->whoami), store(osd->store), log_client(osd->log_client), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), + injectfull(0), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), @@ -753,9 +754,10 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) } enum s_names new_state; + // If testing with injectfull, let's keep determined state as FAILSAFE if (ratio > failsafe_ratio) { new_state = FAILSAFE; - } else if (ratio > full_ratio) { + } else if (ratio > full_ratio || injectfull) { new_state = FULL; } else if (ratio > nearfull_ratio) { new_state = NEARFULL; @@ -806,6 +808,16 @@ bool OSDService::need_fullness_update() bool OSDService::check_failsafe_full() { Mutex::Locker l(full_status_lock); + + if (injectfull) { + // injectfull is either a count of the number of times to return full + // or if -1 then always return full + if (injectfull > 0) + --injectfull; + dout(5) << __func__ << " Injected full OSD (" << (injectfull < 0 ? 
"set" : std::to_string(injectfull)) << ")" << dendl; + return true; + } + if (cur_state == FAILSAFE) return true; return false; @@ -826,6 +838,14 @@ bool OSDService::is_full() bool OSDService::too_full_for_backfill(ostream &ss) { Mutex::Locker l(full_status_lock); + if (injectfull) { + // injectfull is either a count of the number of times to return full + // or if -1 then always return full + if (injectfull > 0) + --injectfull; + ss << "Injected full OSD (" << (injectfull < 0 ? string("set") : std::to_string(injectfull)) << ")"; + return true; + } double max_ratio; max_ratio = cct->_conf->osd_backfill_full_ratio; ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio " << max_ratio; @@ -2629,6 +2649,13 @@ void OSD::final_init() test_ops_hook, "Trigger a scheduled scrub "); assert(r == 0); + r = admin_socket->register_command( + "injectfull", + "injectfull " \ + "name=count,type=CephInt,req=false ", + test_ops_hook, + "Inject a full disk (optional count times)"); + assert(r == 0); } void OSD::create_logger() @@ -4851,6 +4878,10 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, pg->unlock(); return; } + if (command == "injectfull") { + cmd_getval(service->cct, cmdmap, "count", service->injectfull, (int64_t)-1); + return; + } ss << "Internal error - command=" << command; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index e593d552a6b..29bacb60879 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -461,6 +461,7 @@ public: LogClient &log_client; LogChannelRef clog; PGRecoveryStats &pg_recovery_stats; + int64_t injectfull; private: Messenger *&cluster_messenger; Messenger *&client_messenger; From a5731076add0af10686da482ecc29a1fa2600a14 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 30 Mar 2017 12:42:54 -0700 Subject: [PATCH 12/24] osd: Handle backfillfull_ratio just like nearfull and full Add BACKFILLFULL as a local OSD cur_state Notify monitor of this new fullness state Signed-off-by: David Zafman --- .../osd_internals/recovery_reservation.rst | 6 +-- doc/man/8/ceph.rst | 6 +++ doc/rados/configuration/mon-config-ref.rst | 10 +++++ doc/rados/configuration/osd-config-ref.rst | 9 ---- doc/rados/operations/monitoring-osd-pg.rst | 5 ++- .../troubleshooting/troubleshooting-osd.rst | 10 +++-- qa/tasks/ceph_manager.py | 14 ++++--- qa/workunits/ceph-helpers.sh | 2 +- qa/workunits/cephtool/test.sh | 2 + qa/workunits/rest/test.py | 4 ++ src/common/ceph_strings.cc | 2 + src/common/config_opts.h | 4 +- src/include/rados.h | 1 + src/mon/MonCommands.h | 4 ++ src/mon/OSDMonitor.cc | 30 +++++++++---- src/osd/OSD.cc | 42 ++++++++++++------- src/osd/OSD.h | 4 +- src/osd/OSDMap.cc | 31 +++++++++++--- src/osd/OSDMap.h | 8 +++- src/test/cli/osdmaptool/clobber.t | 2 + src/test/cli/osdmaptool/create-print.t | 1 + src/test/cli/osdmaptool/create-racks.t | 1 + src/test/pybind/test_ceph_argparse.py | 3 ++ src/tools/ceph_monstore_tool.cc | 8 ++++ 24 files changed, 151 insertions(+), 58 deletions(-) diff --git a/doc/dev/osd_internals/recovery_reservation.rst b/doc/dev/osd_internals/recovery_reservation.rst index 24db1387f50..cabea04cc73 100644 --- a/doc/dev/osd_internals/recovery_reservation.rst +++ b/doc/dev/osd_internals/recovery_reservation.rst @@ -34,8 +34,8 @@ the typical process. Once the primary has its local reservation, it requests a remote reservation from the backfill target. This reservation CAN be rejected, -for instance if the OSD is too full (osd_backfill_full_ratio config -option). 
If the reservation is rejected, the primary drops its local +for instance if the OSD is too full (backfillfull_ratio osd setting). +If the reservation is rejected, the primary drops its local reservation, waits (osd_backfill_retry_interval), and then retries. It will retry indefinitely. @@ -64,7 +64,7 @@ to the monitor. The state chart can set: - recovering: recovering - backfill_wait: waiting for remote backfill reservations - backfilling: backfilling - - backfill_toofull: backfill reservation rejected, OSD too full + - backfill_toofull: backfill stopped, OSD(s) above backfillfull ratio -------- diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst index f878f882525..b2489126848 100644 --- a/doc/man/8/ceph.rst +++ b/doc/man/8/ceph.rst @@ -1166,6 +1166,12 @@ Usage:: ceph pg set_full_ratio +Subcommand ``set_backfillfull_ratio`` sets ratio at which pgs are considered too full to backfill. + +Usage:: + + ceph pg set_backfillfull_ratio + Subcommand ``set_nearfull_ratio`` sets ratio at which pgs are considered nearly full. diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst index 8c05571c6ce..b19461f7a62 100644 --- a/doc/rados/configuration/mon-config-ref.rst +++ b/doc/rados/configuration/mon-config-ref.rst @@ -400,6 +400,7 @@ a reasonable number for a near full ratio. [global] mon osd full ratio = .80 + mon osd backfillfull ratio = .75 mon osd nearfull ratio = .70 @@ -412,6 +413,15 @@ a reasonable number for a near full ratio. :Default: ``.95`` +``mon osd backfillfull ratio`` + +:Description: The percentage of disk space used before an OSD is + considered too ``full`` to backfill. + +:Type: Float +:Default: ``.90`` + + ``mon osd nearfull ratio`` :Description: The percentage of disk space used before an OSD is diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst index d68316dff60..5679c0caeeb 100644 --- a/doc/rados/configuration/osd-config-ref.rst +++ b/doc/rados/configuration/osd-config-ref.rst @@ -560,15 +560,6 @@ priority than requests to read or write data. :Default: ``512`` -``osd backfill full ratio`` - -:Description: Refuse to accept backfill requests when the Ceph OSD Daemon's - full ratio is above this value. - -:Type: Float -:Default: ``0.85`` - - ``osd backfill retry interval`` :Description: The number of seconds to wait before retrying backfill requests. diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst index 37f1960cba6..b390b030b71 100644 --- a/doc/rados/operations/monitoring-osd-pg.rst +++ b/doc/rados/operations/monitoring-osd-pg.rst @@ -496,8 +496,9 @@ placement group can't be backfilled, it may be considered ``incomplete``. Ceph provides a number of settings to manage the load spike associated with reassigning placement groups to an OSD (especially a new OSD). By default, ``osd_max_backfills`` sets the maximum number of concurrent backfills to or from -an OSD to 10. The ``osd backfill full ratio`` enables an OSD to refuse a -backfill request if the OSD is approaching its full ratio (85%, by default). +an OSD to 10. The ``backfill full ratio`` enables an OSD to refuse a +backfill request if the OSD is approaching its full ratio (90%, by default) and +change with ``ceph osd set-backfillfull-ratio`` comand. If an OSD refuses a backfill request, the ``osd backfill retry interval`` enables an OSD to retry the request (after 10 seconds, by default). 
OSDs can also set ``osd backfill scan min`` and ``osd backfill scan max`` to manage scan diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst index 3661f8af45d..651907dfb05 100644 --- a/doc/rados/troubleshooting/troubleshooting-osd.rst +++ b/doc/rados/troubleshooting/troubleshooting-osd.rst @@ -206,7 +206,9 @@ Ceph prevents you from writing to a full OSD so that you don't lose data. In an operational cluster, you should receive a warning when your cluster is getting near its full ratio. The ``mon osd full ratio`` defaults to ``0.95``, or 95% of capacity before it stops clients from writing data. -The ``mon osd nearfull ratio`` defaults to ``0.85``, or 85% of capacity +The ``mon osd backfillfull ratio`` defaults to ``0.90``, or 90 % of +capacity when it blocks backfills from starting. The +``mon osd nearfull ratio`` defaults to ``0.85``, or 85% of capacity when it generates a health warning. Full cluster issues usually arise when testing how Ceph handles an OSD @@ -214,7 +216,8 @@ failure on a small cluster. When one node has a high percentage of the cluster's data, the cluster can easily eclipse its nearfull and full ratio immediately. If you are testing how Ceph reacts to OSD failures on a small cluster, you should leave ample free disk space and consider temporarily -lowering the ``mon osd full ratio`` and ``mon osd nearfull ratio``. +lowering the ``mon osd full ratio``, ``mon osd backfillfull ratio`` and +``mon osd nearfull ratio``. Full ``ceph-osds`` will be reported by ``ceph health``:: @@ -225,9 +228,10 @@ Full ``ceph-osds`` will be reported by ``ceph health``:: Or:: ceph health - HEALTH_ERR 1 nearfull osds, 1 full osds + HEALTH_ERR 1 nearfull osds, 1 backfillfull osds, 1 full osds osd.2 is near full at 85% osd.3 is full at 97% + osd.4 is backfill full at 91% The best way to deal with a full cluster is to add new ``ceph-osds``, allowing the cluster to redistribute data to the newly available storage. diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index 8ff2556a7a0..1a9aff93c3c 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -696,7 +696,7 @@ class Thrasher: """ Test backfills stopping when the replica fills up. - First, use osd_backfill_full_ratio to simulate a now full + First, use injectfull admin command to simulate a now full osd by setting it to 0 on all of the OSDs. Second, on a random subset, set @@ -705,13 +705,14 @@ class Thrasher: Then, verify that all backfills stop. 
""" - self.log("injecting osd_backfill_full_ratio = 0") + self.log("injecting backfill full") for i in self.live_osds: self.ceph_manager.set_config( i, osd_debug_skip_full_check_in_backfill_reservation= - random.choice(['false', 'true']), - osd_backfill_full_ratio=0) + random.choice(['false', 'true'])) + self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'backfillfull'], + check_status=True, timeout=30, stdout=DEVNULL) for i in range(30): status = self.ceph_manager.compile_pg_status() if 'backfill' not in status.keys(): @@ -724,8 +725,9 @@ class Thrasher: for i in self.live_osds: self.ceph_manager.set_config( i, - osd_debug_skip_full_check_in_backfill_reservation='false', - osd_backfill_full_ratio=0.85) + osd_debug_skip_full_check_in_backfill_reservation='false') + self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'none'], + check_status=True, timeout=30, stdout=DEVNULL) def test_map_discontinuity(self): """ diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh index 9863668de75..8642d376a73 100755 --- a/qa/workunits/ceph-helpers.sh +++ b/qa/workunits/ceph-helpers.sh @@ -400,6 +400,7 @@ EOF if test -z "$(get_config mon $id mon_initial_members)" ; then ceph osd pool delete rbd rbd --yes-i-really-really-mean-it || return 1 ceph osd pool create rbd $PG_NUM || return 1 + ceph osd set-backfillfull-ratio .99 fi } @@ -634,7 +635,6 @@ function activate_osd() { ceph_disk_args+=" --prepend-to-path=" local ceph_args="$CEPH_ARGS" - ceph_args+=" --osd-backfill-full-ratio=.99" ceph_args+=" --osd-failsafe-full-ratio=.99" ceph_args+=" --osd-journal-size=100" ceph_args+=" --osd-scrub-load-threshold=2000" diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 8d0ce1baeff..a6c7b617cc6 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1419,6 +1419,8 @@ function test_mon_pg() ceph osd set-full-ratio .962 ceph osd dump | grep '^full_ratio 0.962' + ceph osd set-backfillfull-ratio .912 + ceph osd dump | grep '^backfillfull_ratio 0.912' ceph osd set-nearfull-ratio .892 ceph osd dump | grep '^nearfull_ratio 0.892' diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index 1a7ab30b886..7bbb6f3ccae 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -359,6 +359,10 @@ if __name__ == '__main__': r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) assert(float(r.myjson['output']['full_ratio']) == 0.90) expect('osd/set-full-ratio?ratio=0.95', 'PUT', 200, '') + expect('osd/set-backfillfull-ratio?ratio=0.88', 'PUT', 200, '') + r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) + assert(float(r.myjson['output']['backfillfull_ratio']) == 0.88) + expect('osd/set-backfillfull-ratio?ratio=0.90', 'PUT', 200, '') expect('osd/set-nearfull-ratio?ratio=0.90', 'PUT', 200, '') r = expect('osd/dump', 'GET', 200, 'json', JSONHDR) assert(float(r.myjson['output']['nearfull_ratio']) == 0.90) diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc index 462dd6db249..1fec2f7b0a1 100644 --- a/src/common/ceph_strings.cc +++ b/src/common/ceph_strings.cc @@ -42,6 +42,8 @@ const char *ceph_osd_state_name(int s) return "full"; case CEPH_OSD_NEARFULL: return "nearfull"; + case CEPH_OSD_BACKFILLFULL: + return "backfillfull"; default: return "???"; } diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a3e74b84860..9eed4d485e2 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -308,6 +308,7 @@ OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on 
pools bel OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT, .5) // threshold of down osds after which we check all pgs OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full" +OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD backfill full (backfill halted) OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full OPTION(mon_allow_pool_delete, OPT_BOOL, false) // allow pool deletion OPTION(mon_globalid_prealloc, OPT_U32, 10000) // how many globalids to prealloc @@ -626,9 +627,6 @@ OPTION(osd_max_backfills, OPT_U64, 1) // Minimum recovery priority (255 = max, smaller = lower) OPTION(osd_min_recovery_priority, OPT_INT, 0) -// Refuse backfills when OSD full ratio is above this value -OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.90) - // Seconds to wait before retrying refused backfills OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) diff --git a/src/include/rados.h b/src/include/rados.h index c8bc8ac4c0c..986b42969c4 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -116,6 +116,7 @@ struct ceph_eversion { #define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ #define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */ #define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */ +#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */ extern const char *ceph_osd_state_name(int s); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 4e816890820..d1b09e66024 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -592,6 +592,10 @@ COMMAND("osd set-full-ratio " \ "name=ratio,type=CephFloat,range=0.0|1.0", \ "set usage ratio at which OSDs are marked full", "osd", "rw", "cli,rest") +COMMAND("osd set-backfillfull-ratio " \ + "name=ratio,type=CephFloat,range=0.0|1.0", \ + "set usage ratio at which OSDs are marked too full to backfill", + "osd", "rw", "cli,rest") COMMAND("osd set-nearfull-ratio " \ "name=ratio,type=CephFloat,range=0.0|1.0", \ "set usage ratio at which OSDs are marked near-full", diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7daca9c887d..3566c3f7330 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -164,6 +164,7 @@ void OSDMonitor::create_initial() if (!g_conf->mon_debug_no_require_luminous) { newmap.set_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); newmap.full_ratio = g_conf->mon_osd_full_ratio; + newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio; } @@ -784,8 +785,15 @@ void OSDMonitor::create_pending() OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc); dout(10) << "create_pending did clean_temps" << dendl; + // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config + // instead of osd_backfill_full_ratio config + if (osdmap.backfillfull_ratio <= 0) { + dout(1) << __func__ << " setting backfillfull_ratio = " + << g_conf->mon_osd_backfillfull_ratio << dendl; + pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; + } if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - // transition nearfull ratios from PGMap to OSDMap (on upgrade) + // transition full ratios from PGMap to OSDMap (on upgrade) PGMap *pg_map = &mon->pgmon()->pg_map; if (osdmap.full_ratio != pg_map->full_ratio) { dout(10) << __func__ << " full_ratio " << osdmap.full_ratio @@ 
-1048,8 +1056,8 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) tmp.apply_incremental(pending_inc); if (tmp.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - int full, nearfull; - tmp.count_full_nearfull_osds(&full, &nearfull); + int full, backfill, nearfull; + tmp.count_full_nearfull_osds(&full, &backfill, &nearfull); if (full > 0) { if (!tmp.test_flag(CEPH_OSDMAP_FULL)) { dout(10) << __func__ << " setting full flag" << dendl; @@ -2287,7 +2295,7 @@ bool OSDMonitor::preprocess_full(MonOpRequestRef op) MOSDFull *m = static_cast(op->get_req()); int from = m->get_orig_source().num(); set state; - unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_FULL; + unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL; // check permissions, ignore if failed MonSession *session = m->get_session(); @@ -2337,7 +2345,7 @@ bool OSDMonitor::prepare_full(MonOpRequestRef op) const MOSDFull *m = static_cast(op->get_req()); const int from = m->get_orig_source().num(); - const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_FULL; + const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL; const unsigned want_state = m->state & mask; // safety first unsigned cur_state = osdmap.get_state(from); @@ -3342,13 +3350,18 @@ void OSDMonitor::get_health(list >& summary, } if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - int full, nearfull; - osdmap.count_full_nearfull_osds(&full, &nearfull); + int full, backfill, nearfull; + osdmap.count_full_nearfull_osds(&full, &backfill, &nearfull); if (full > 0) { ostringstream ss; ss << full << " full osd(s)"; summary.push_back(make_pair(HEALTH_ERR, ss.str())); } + if (backfill > 0) { + ostringstream ss; + ss << backfill << " backfillfull osd(s)"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + } if (nearfull > 0) { ostringstream ss; ss << nearfull << " nearfull osd(s)"; @@ -6929,6 +6942,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, return true; } else if (prefix == "osd set-full-ratio" || + prefix == "osd set-backfillfull-ratio" || prefix == "osd set-nearfull-ratio") { if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { ss << "you must complete the upgrade and set require_luminous_osds before" @@ -6945,6 +6959,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, } if (prefix == "osd set-full-ratio") pending_inc.new_full_ratio = n; + else if (prefix == "osd set-backfillfull-ratio") + pending_inc.new_backfillfull_ratio = n; else if (prefix == "osd set-nearfull-ratio") pending_inc.new_nearfull_ratio = n; ss << prefix << " " << n; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 8799b7933a3..1368c9d654e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -736,20 +736,24 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) return; } float nearfull_ratio = osdmap->get_nearfull_ratio(); - float full_ratio = std::max(osdmap->get_full_ratio(), nearfull_ratio); + float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio); + float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio); float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio); if (!osdmap->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { // use the failsafe for nearfull and full; the mon isn't using the // flags anyway because we're mid-upgrade. 
full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; nearfull_ratio = failsafe_ratio; } else if (full_ratio <= 0 || + backfillfull_ratio <= 0 || nearfull_ratio <= 0) { - derr << __func__ << " full_ratio or nearfull_ratio is <= 0" << dendl; + derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl; // use failsafe flag. ick. the monitor did something wrong or the user // did something stupid. full_ratio = failsafe_ratio; + backfillfull_ratio = failsafe_ratio; nearfull_ratio = failsafe_ratio; } @@ -759,6 +763,8 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) new_state = FAILSAFE; } else if (ratio > full_ratio || injectfull) { new_state = FULL; + } else if (ratio > backfillfull_ratio) { + new_state = BACKFILLFULL; } else if (ratio > nearfull_ratio) { new_state = NEARFULL; } else { @@ -766,6 +772,7 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) } dout(20) << __func__ << " cur ratio " << ratio << ". nearfull_ratio " << nearfull_ratio + << ". backfillfull_ratio " << backfillfull_ratio << ", full_ratio " << full_ratio << ", failsafe_ratio " << failsafe_ratio << ", new state " << get_full_state_name(new_state) @@ -793,6 +800,8 @@ bool OSDService::need_fullness_update() if (osdmap->exists(whoami)) { if (osdmap->get_state(whoami) & CEPH_OSD_FULL) { cur = FULL; + } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) { + cur = BACKFILLFULL; } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) { cur = NEARFULL; } @@ -800,6 +809,8 @@ bool OSDService::need_fullness_update() s_names want = NONE; if (is_full()) want = FULL; + else if (is_backfillfull()) + want = BACKFILLFULL; else if (is_nearfull()) want = NEARFULL; return want != cur; @@ -818,15 +829,19 @@ bool OSDService::check_failsafe_full() return true; } - if (cur_state == FAILSAFE) - return true; - return false; + return cur_state == FAILSAFE; } bool OSDService::is_nearfull() { Mutex::Locker l(full_status_lock); - return cur_state == NEARFULL; + return cur_state >= NEARFULL; +} + +bool OSDService::is_backfillfull() +{ + Mutex::Locker l(full_status_lock); + return cur_state >= BACKFILLFULL; } bool OSDService::is_full() @@ -838,18 +853,11 @@ bool OSDService::is_full() bool OSDService::too_full_for_backfill(ostream &ss) { Mutex::Locker l(full_status_lock); - if (injectfull) { - // injectfull is either a count of the number of times to return full - // or if -1 then always return full - if (injectfull > 0) - --injectfull; - ss << "Injected full OSD (" << (injectfull < 0 ? 
string("set") : std::to_string(injectfull)) << ")"; + if (cur_state >= BACKFILLFULL) { + ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio"; return true; } - double max_ratio; - max_ratio = cct->_conf->osd_backfill_full_ratio; - ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio " << max_ratio; - return cur_ratio >= max_ratio; + return false; } void OSDService::update_osd_stat(vector& hb_peers) @@ -5213,6 +5221,8 @@ void OSD::send_full_update() unsigned state = 0; if (service.is_full()) { state = CEPH_OSD_FULL; + } else if (service.is_backfillfull()) { + state = CEPH_OSD_BACKFILLFULL; } else if (service.is_nearfull()) { state = CEPH_OSD_NEARFULL; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 29bacb60879..19a1ac7a2df 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1139,11 +1139,12 @@ public: // -- OSD Full Status -- private: Mutex full_status_lock; - enum s_names { NONE, NEARFULL, FULL, FAILSAFE } cur_state; // ascending + enum s_names { NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending const char *get_full_state_name(s_names s) { switch (s) { case NONE: return "none"; case NEARFULL: return "nearfull"; + case BACKFILLFULL: return "backfillfull"; case FULL: return "full"; case FAILSAFE: return "failsafe"; default: return "???"; @@ -1155,6 +1156,7 @@ private: public: bool check_failsafe_full(); bool is_nearfull(); + bool is_backfillfull(); bool is_full(); bool too_full_for_backfill(ostream &ss); bool need_fullness_update(); ///< osdmap state needs update diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index c4e388f86cf..6d0cbfe0a28 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -450,7 +450,7 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const } { - uint8_t target_v = 3; + uint8_t target_v = 4; if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { target_v = 2; } @@ -470,6 +470,7 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const if (target_v >= 3) { ::encode(new_nearfull_ratio, bl); ::encode(new_full_ratio, bl); + ::encode(new_backfillfull_ratio, bl); } ENCODE_FINISH(bl); // osd-only data } @@ -654,7 +655,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) } { - DECODE_START(3, bl); // extended, osd-only data + DECODE_START(4, bl); // extended, osd-only data ::decode(new_hb_back_up, bl); ::decode(new_up_thru, bl); ::decode(new_last_clean_interval, bl); @@ -677,6 +678,11 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) new_nearfull_ratio = -1; new_full_ratio = -1; } + if (struct_v >= 4) { + ::decode(new_backfillfull_ratio, bl); + } else { + new_backfillfull_ratio = -1; + } DECODE_FINISH(bl); // osd-only data } @@ -720,6 +726,7 @@ void OSDMap::Incremental::dump(Formatter *f) const f->dump_int("new_flags", new_flags); f->dump_float("new_full_ratio", new_full_ratio); f->dump_float("new_nearfull_ratio", new_nearfull_ratio); + f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio); if (fullmap.length()) { f->open_object_section("full_map"); @@ -1022,14 +1029,17 @@ int OSDMap::calc_num_osds() return num_osd; } -void OSDMap::count_full_nearfull_osds(int *full, int *nearfull) const +void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const { *full = 0; + *backfill = 0; *nearfull = 0; for (int i = 0; i < max_osd; ++i) { if (exists(i) && is_up(i) && is_in(i)) { if (osd_state[i] & CEPH_OSD_FULL) ++(*full); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + ++(*backfill); else if 
(osd_state[i] & CEPH_OSD_NEARFULL) ++(*nearfull); } @@ -1575,6 +1585,9 @@ int OSDMap::apply_incremental(const Incremental &inc) if (inc.new_nearfull_ratio >= 0) { nearfull_ratio = inc.new_nearfull_ratio; } + if (inc.new_backfillfull_ratio >= 0) { + backfillfull_ratio = inc.new_backfillfull_ratio; + } if (inc.new_full_ratio >= 0) { full_ratio = inc.new_full_ratio; } @@ -2148,7 +2161,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const } { - uint8_t target_v = 2; + uint8_t target_v = 3; if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { target_v = 1; } @@ -2173,6 +2186,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const if (target_v >= 2) { ::encode(nearfull_ratio, bl); ::encode(full_ratio, bl); + ::encode(backfillfull_ratio, bl); } ENCODE_FINISH(bl); // osd-only data } @@ -2390,7 +2404,7 @@ void OSDMap::decode(bufferlist::iterator& bl) } { - DECODE_START(2, bl); // extended, osd-only data + DECODE_START(3, bl); // extended, osd-only data ::decode(osd_addrs->hb_back_addr, bl); ::decode(osd_info, bl); ::decode(blacklist, bl); @@ -2407,6 +2421,11 @@ void OSDMap::decode(bufferlist::iterator& bl) nearfull_ratio = 0; full_ratio = 0; } + if (struct_v >= 3) { + ::decode(backfillfull_ratio, bl); + } else { + backfillfull_ratio = 0; + } DECODE_FINISH(bl); // osd-only data } @@ -2480,6 +2499,7 @@ void OSDMap::dump(Formatter *f) const f->dump_stream("modified") << get_modified(); f->dump_string("flags", get_flag_string()); f->dump_float("full_ratio", full_ratio); + f->dump_float("backfillfull_ratio", backfillfull_ratio); f->dump_float("nearfull_ratio", nearfull_ratio); f->dump_string("cluster_snapshot", get_cluster_snapshot()); f->dump_int("pool_max", get_pool_max()); @@ -2701,6 +2721,7 @@ void OSDMap::print(ostream& out) const out << "flags " << get_flag_string() << "\n"; out << "full_ratio " << full_ratio << "\n"; + out << "backfillfull_ratio " << backfillfull_ratio << "\n"; out << "nearfull_ratio " << nearfull_ratio << "\n"; if (get_cluster_snapshot().length()) out << "cluster_snapshot " << get_cluster_snapshot() << "\n"; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index eb0399edda6..2e8fcf800d9 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -155,6 +155,7 @@ public: string cluster_snapshot; float new_nearfull_ratio = -1; + float new_backfillfull_ratio = -1; float new_full_ratio = -1; mutable bool have_crc; ///< crc values are defined @@ -254,7 +255,7 @@ private: string cluster_snapshot; bool new_blacklist_entries; - float full_ratio = 0, nearfull_ratio = 0; + float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; mutable uint64_t cached_up_osd_features; @@ -336,10 +337,13 @@ public: float get_full_ratio() const { return full_ratio; } + float get_backfillfull_ratio() const { + return backfillfull_ratio; + } float get_nearfull_ratio() const { return nearfull_ratio; } - void count_full_nearfull_osds(int *full, int *nearfull) const; + void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const; /***** cluster state *****/ /* osds */ diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t index 275fefcc737..dd7e1756104 100644 --- a/src/test/cli/osdmaptool/clobber.t +++ b/src/test/cli/osdmaptool/clobber.t @@ -20,6 +20,7 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 @@ -43,6 +44,7 @@ modified 
\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 flags hashpspool stripe_width 0 diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t index e619f7206e9..32468a4a6fa 100644 --- a/src/test/cli/osdmaptool/create-print.t +++ b/src/test/cli/osdmaptool/create-print.t @@ -77,6 +77,7 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 diff --git a/src/test/cli/osdmaptool/create-racks.t b/src/test/cli/osdmaptool/create-racks.t index 19006986f68..0759698127d 100644 --- a/src/test/cli/osdmaptool/create-racks.t +++ b/src/test/cli/osdmaptool/create-racks.t @@ -790,6 +790,7 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags full_ratio 0 + backfillfull_ratio 0 nearfull_ratio 0 pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0 diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index e9694064bd2..0c9cc7524c5 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -1150,6 +1150,9 @@ class TestOSD(TestArgparse): def test_set_full_ratio(self): self.set_ratio('set-full-ratio') + def test_set_backfillfull_ratio(self): + self.set_ratio('set-backfillfull-ratio') + def test_set_nearfull_ratio(self): self.set_ratio('set-nearfull-ratio') diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index 874a4f0583f..8c941443d81 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -654,6 +654,14 @@ static int update_pgmap_meta(MonitorDBStore& st) ::encode(full_ratio, bl); t->put(prefix, "full_ratio", bl); } + { + auto backfillfull_ratio = g_ceph_context->_conf->mon_osd_backfillfull_ratio; + if (backfillfull_ratio > 1.0) + backfillfull_ratio /= 100.0; + bufferlist bl; + ::encode(backfillfull_ratio, bl); + t->put(prefix, "backfillfull_ratio", bl); + } { auto nearfull_ratio = g_ceph_context->_conf->mon_osd_nearfull_ratio; if (nearfull_ratio > 1.0) From 1e2fde1012fe9df1ce114256cd72d9da902a01ab Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 31 Mar 2017 14:13:14 -0700 Subject: [PATCH 13/24] osd: Revamp injectfull op to support all full states Use check_* for injectable full checks Use is_* to just test simple cur_state Signed-off-by: David Zafman --- src/osd/OSD.cc | 101 ++++++++++++++++++++++++++++------------ src/osd/OSD.h | 39 ++++++++++++---- src/osd/PG.cc | 8 ++-- src/osd/PrimaryLogPG.cc | 6 ++- 4 files changed, 110 insertions(+), 44 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1368c9d654e..644c9b959c8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -221,7 +221,6 @@ OSDService::OSDService(OSD *osd) : whoami(osd->whoami), store(osd->store), log_client(osd->log_client), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), - injectfull(0), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), @@ -757,11 +756,14 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) nearfull_ratio = failsafe_ratio; } - enum s_names new_state; - // If testing with injectfull, let's 
keep determined state as FAILSAFE - if (ratio > failsafe_ratio) { + string inject; + s_names new_state; + if (injectfull_state > NONE && injectfull) { + new_state = injectfull_state; + inject = "(Injected)"; + } else if (ratio > failsafe_ratio) { new_state = FAILSAFE; - } else if (ratio > full_ratio || injectfull) { + } else if (ratio > full_ratio) { new_state = FULL; } else if (ratio > backfillfull_ratio) { new_state = BACKFILLFULL; @@ -776,6 +778,7 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) << ", full_ratio " << full_ratio << ", failsafe_ratio " << failsafe_ratio << ", new state " << get_full_state_name(new_state) + << " " << inject << dendl; // warn @@ -816,48 +819,73 @@ bool OSDService::need_fullness_update() return want != cur; } -bool OSDService::check_failsafe_full() +bool OSDService::_check_full(s_names type, ostream &ss) const { Mutex::Locker l(full_status_lock); - if (injectfull) { - // injectfull is either a count of the number of times to return full + if (injectfull && injectfull_state >= type) { + // injectfull is either a count of the number of times to return failsafe full // or if -1 then always return full if (injectfull > 0) --injectfull; - dout(5) << __func__ << " Injected full OSD (" << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")" << dendl; + ss << "Injected " << get_full_state_name(type) << " OSD (" + << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"; return true; } + ss << "current usage is " << cur_ratio; + return cur_state >= type; +} + +bool OSDService::check_failsafe_full(ostream &ss) const +{ + return _check_full(FAILSAFE, ss); +} + +bool OSDService::check_full(ostream &ss) const +{ + return _check_full(FULL, ss); +} + +bool OSDService::check_backfill_full(ostream &ss) const +{ + return _check_full(BACKFILLFULL, ss); +} + +bool OSDService::check_nearfull(ostream &ss) const +{ + return _check_full(NEARFULL, ss); +} + +bool OSDService::is_failsafe_full() const +{ + Mutex::Locker l(full_status_lock); return cur_state == FAILSAFE; } -bool OSDService::is_nearfull() -{ - Mutex::Locker l(full_status_lock); - return cur_state >= NEARFULL; -} - -bool OSDService::is_backfillfull() -{ - Mutex::Locker l(full_status_lock); - return cur_state >= BACKFILLFULL; -} - -bool OSDService::is_full() +bool OSDService::is_full() const { Mutex::Locker l(full_status_lock); return cur_state >= FULL; } -bool OSDService::too_full_for_backfill(ostream &ss) +bool OSDService::is_backfillfull() const { Mutex::Locker l(full_status_lock); - if (cur_state >= BACKFILLFULL) { - ss << "current usage is " << cur_ratio << ", which is greater than max allowed ratio"; - return true; - } - return false; + return cur_state >= BACKFILLFULL; +} + +bool OSDService::is_nearfull() const +{ + Mutex::Locker l(full_status_lock); + return cur_state >= NEARFULL; +} + +void OSDService::set_injectfull(s_names type, int64_t count) +{ + Mutex::Locker l(full_status_lock); + injectfull_state = type; + injectfull = count; } void OSDService::update_osd_stat(vector& hb_peers) @@ -2660,6 +2688,7 @@ void OSD::final_init() r = admin_socket->register_command( "injectfull", "injectfull " \ + "name=type,type=CephString,req=false " \ "name=count,type=CephInt,req=false ", test_ops_hook, "Inject a full disk (optional count times)"); @@ -4887,7 +4916,21 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, return; } if (command == "injectfull") { - cmd_getval(service->cct, cmdmap, "count", service->injectfull, (int64_t)-1); + int64_t count; + string type; + 
OSDService::s_names state; + cmd_getval(service->cct, cmdmap, "type", type, string("full")); + cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1); + if (type == "none" || count == 0) { + type = "none"; + count = 0; + } + state = service->get_full_state(type); + if (state == OSDService::s_names::INVALID) { + ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)"; + return; + } + service->set_injectfull(state, count); return; } ss << "Internal error - command=" << command; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 19a1ac7a2df..45f76415f49 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -461,7 +461,6 @@ public: LogClient &log_client; LogChannelRef clog; PGRecoveryStats &pg_recovery_stats; - int64_t injectfull; private: Messenger *&cluster_messenger; Messenger *&client_messenger; @@ -1138,9 +1137,10 @@ public: // -- OSD Full Status -- private: - Mutex full_status_lock; - enum s_names { NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending - const char *get_full_state_name(s_names s) { + friend TestOpsSocketHook; + mutable Mutex full_status_lock; + enum s_names { INVALID = -1, NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending + const char *get_full_state_name(s_names s) const { switch (s) { case NONE: return "none"; case NEARFULL: return "nearfull"; @@ -1150,16 +1150,37 @@ private: default: return "???"; } } + s_names get_full_state(string type) const { + if (type == "none") + return NONE; + else if (type == "failsafe") + return FAILSAFE; + else if (type == "full") + return FULL; + else if (type == "backfillfull") + return BACKFILLFULL; + else if (type == "nearfull") + return NEARFULL; + else + return INVALID; + } double cur_ratio; ///< current utilization + mutable int64_t injectfull = 0; + s_names injectfull_state = NONE; float get_failsafe_full_ratio(); void check_full_status(const osd_stat_t &stat); + bool _check_full(s_names type, ostream &ss) const; public: - bool check_failsafe_full(); - bool is_nearfull(); - bool is_backfillfull(); - bool is_full(); - bool too_full_for_backfill(ostream &ss); + bool check_failsafe_full(ostream &ss) const; + bool check_full(ostream &ss) const; + bool check_backfill_full(ostream &ss) const; + bool check_nearfull(ostream &ss) const; + bool is_failsafe_full() const; + bool is_full() const; + bool is_backfillfull() const; + bool is_nearfull() const; bool need_fullness_update(); ///< osdmap state needs update + void set_injectfull(s_names type, int64_t count); // -- epochs -- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 7cee9141820..d8d9c26dd14 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6561,8 +6561,8 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt) ldout(pg->cct, 10) << "backfill reservation rejected: failure injection" << dendl; post_event(RemoteReservationRejected()); - } else if (pg->osd->too_full_for_backfill(ss) && - !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { + } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && + pg->osd->check_backfill_full(ss)) { ldout(pg->cct, 10) << "backfill reservation rejected: " << ss.str() << dendl; post_event(RemoteReservationRejected()); @@ -6597,8 +6597,8 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); post_event(RemoteReservationRejected()); return discard_event(); - } else if (pg->osd->too_full_for_backfill(ss) && - 
!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { + } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && + pg->osd->check_backfill_full(ss)) { ldout(pg->cct, 10) << "backfill reservation rejected after reservation: " << ss.str() << dendl; pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 553ffca191e..5a73ca5e982 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1891,8 +1891,10 @@ void PrimaryLogPG::do_op(OpRequestRef& op) // mds should have stopped writing before this point. // We can't allow OSD to become non-startable even if mds // could be writing as part of file removals. - if (write_ordered && osd->check_failsafe_full()) { + ostringstream ss; + if (write_ordered && osd->check_failsafe_full(ss)) { dout(10) << __func__ << " fail-safe full check failed, dropping request" + << ss.str() << dendl; return; } @@ -3332,7 +3334,7 @@ void PrimaryLogPG::do_scan( case MOSDPGScan::OP_SCAN_GET_DIGEST: { ostringstream ss; - if (osd->too_full_for_backfill(ss)) { + if (osd->check_backfill_full(ss)) { dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl; queue_peering_event( CephPeeringEvtRef( From 1711ccdec779d32c4b128c1e3f6d10bd78abcb1c Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Apr 2017 16:20:35 -0700 Subject: [PATCH 14/24] osd: Check failsafe full and crash on push/pull Signed-off-by: David Zafman --- src/osd/ECBackend.cc | 5 +++++ src/osd/PGBackend.h | 2 ++ src/osd/PrimaryLogPG.cc | 4 ++++ src/osd/PrimaryLogPG.h | 1 + src/osd/ReplicatedBackend.cc | 12 ++++++++++++ 5 files changed, 24 insertions(+) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 8e7b09fe38a..ea285cc4e6f 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -282,6 +282,11 @@ void ECBackend::handle_recovery_push( const PushOp &op, RecoveryMessages *m) { + ostringstream ss; + if (get_parent()->check_failsafe_full(ss)) { + dout(10) << __func__ << " Out of space (failsafe) processing push request: " << ss.str() << dendl; + ceph_abort(); + } bool oneshot = op.before_progress.first && op.after_progress.data_complete; ghobject_t tobj; diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 763e02bc2b8..ddd0612b131 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -261,6 +261,8 @@ typedef ceph::shared_ptr OSDMapRef; virtual LogClientTemp clog_error() = 0; + virtual bool check_failsafe_full(ostream &ss) = 0; + virtual ~Listener() {} }; Listener *parent; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 5a73ca5e982..9398783d867 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -13272,6 +13272,10 @@ int PrimaryLogPG::getattrs_maybe_cache( return r; } +bool PrimaryLogPG::check_failsafe_full(ostream &ss) { + return osd->check_failsafe_full(ss); +} + void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); } void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 3d92e3b968c..f5b025db0e4 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1731,6 +1731,7 @@ public: void on_flushed() override; void on_removal(ObjectStore::Transaction *t) override; void on_shutdown() override; + bool check_failsafe_full(ostream &ss) override; // attr cache handling void setattr_maybe_cache( diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index b897eddf815..1364cc62770 100644 --- 
a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -807,6 +807,11 @@ void ReplicatedBackend::_do_push(OpRequestRef op) vector replies; ObjectStore::Transaction t; + ostringstream ss; + if (get_parent()->check_failsafe_full(ss)) { + dout(10) << __func__ << " Out of space (failsafe) processing push request: " << ss.str() << dendl; + ceph_abort(); + } for (vector::const_iterator i = m->pushes.begin(); i != m->pushes.end(); ++i) { @@ -862,6 +867,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) op->mark_started(); vector replies(1); + + ostringstream ss; + if (get_parent()->check_failsafe_full(ss)) { + dout(10) << __func__ << " Out of space (failsafe) processing pull response (push): " << ss.str() << dendl; + ceph_abort(); + } + ObjectStore::Transaction t; list to_continue; for (vector::const_iterator i = m->pushes.begin(); From 94e253ce37d6e840b47513c103c871f05111c2c6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 10:30:57 -0700 Subject: [PATCH 15/24] osd: Rename backfill_request_* to recovery_request_* To be used by both recovery and backfill Signed-off-by: David Zafman --- src/osd/OSD.cc | 10 +++++----- src/osd/OSD.h | 6 +++--- src/osd/PG.cc | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 644c9b959c8..2bc8cce857e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -255,8 +255,8 @@ OSDService::OSDService(OSD *osd) : watch_lock("OSDService::watch_lock"), watch_timer(osd->client_messenger->cct, watch_lock), next_notif_id(0), - backfill_request_lock("OSDService::backfill_request_lock"), - backfill_request_timer(cct, backfill_request_lock, false), + recovery_request_lock("OSDService::recovery_request_lock"), + recovery_request_timer(cct, recovery_request_lock, false), reserver_finisher(cct), local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills, cct->_conf->osd_min_recovery_priority), @@ -495,8 +495,8 @@ void OSDService::shutdown() objecter_finisher.stop(); { - Mutex::Locker l(backfill_request_lock); - backfill_request_timer.shutdown(); + Mutex::Locker l(recovery_request_lock); + recovery_request_timer.shutdown(); } { @@ -2200,7 +2200,7 @@ int OSD::init() tick_timer.init(); tick_timer_without_osd_lock.init(); - service.backfill_request_timer.init(); + service.recovery_request_timer.init(); // mount. 
dout(2) << "mounting " << dev_path << " " diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 45f76415f49..591e5d2d2c7 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -917,9 +917,9 @@ public: return (((uint64_t)cur_epoch) << 32) | ((uint64_t)(next_notif_id++)); } - // -- Backfill Request Scheduling -- - Mutex backfill_request_lock; - SafeTimer backfill_request_timer; + // -- Recovery/Backfill Request Scheduling -- + Mutex recovery_request_lock; + SafeTimer recovery_request_timer; // -- tids -- // for ops i issue diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d8d9c26dd14..e7e21c61302 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3809,8 +3809,8 @@ void PG::reject_reservation() void PG::schedule_backfill_full_retry() { - Mutex::Locker lock(osd->backfill_request_lock); - osd->backfill_request_timer.add_event_after( + Mutex::Locker lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( cct->_conf->osd_backfill_retry_interval, new QueuePeeringEvt( this, get_osdmap()->get_epoch(), From c7e8dcad3414651e44c6b543e41f05f05c946d51 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 16 Mar 2017 08:05:58 -0700 Subject: [PATCH 16/24] osd: Add check_osdmap_full() to check for shard OSD fullness Signed-off-by: David Zafman --- src/osd/OSD.cc | 10 ++++++++++ src/osd/OSD.h | 1 + src/osd/PGBackend.h | 2 ++ src/osd/PrimaryLogPG.cc | 5 +++++ src/osd/PrimaryLogPG.h | 1 + 5 files changed, 19 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2bc8cce857e..2b2aa97beb0 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -921,6 +921,16 @@ void OSDService::update_osd_stat(vector& hb_peers) check_full_status(osd_stat); } +bool OSDService::check_osdmap_full(const set &missing_on) +{ + OSDMapRef osdmap = get_osdmap(); + for (auto shard : missing_on) { + if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL) + return true; + } + return false; +} + void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch) { OSDMapRef next_map = get_nextmap_reserved(); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 591e5d2d2c7..9429640a9b5 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1181,6 +1181,7 @@ public: bool is_nearfull() const; bool need_fullness_update(); ///< osdmap state needs update void set_injectfull(s_names type, int64_t count); + bool check_osdmap_full(const set &missing_on); // -- epochs -- diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index ddd0612b131..66bb890af01 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -263,6 +263,8 @@ typedef ceph::shared_ptr OSDMapRef; virtual bool check_failsafe_full(ostream &ss) = 0; + virtual bool check_osdmap_full(const set &missing_on) = 0; + virtual ~Listener() {} }; Listener *parent; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 9398783d867..71846caabf8 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -13031,6 +13031,11 @@ void PrimaryLogPG::_scrub_finish() } } +bool PrimaryLogPG::check_osdmap_full(const set &missing_on) +{ + return osd->check_osdmap_full(missing_on); +} + /*---SnapTrimmer Logging---*/ #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index f5b025db0e4..a7e812fe8da 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1732,6 +1732,7 @@ public: void on_removal(ObjectStore::Transaction *t) override; void on_shutdown() override; bool check_failsafe_full(ostream &ss) override; + bool check_osdmap_full(const set &missing_on) override; // attr 
cache handling void setattr_maybe_cache( From 27e14504f6c4b7a8e06b038fc396c40e8e6c1456 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 5 Apr 2017 14:09:18 -0700 Subject: [PATCH 17/24] osd: Add PG state and flag for too full for recovery New state machine state NotRecovering New PG state PG_STATE_RECOVERY_TOOFULL Signed-off-by: David Zafman --- .../osd_internals/recovery_reservation.rst | 1 + src/common/config_opts.h | 3 ++ src/mon/PGMonitor.cc | 3 ++ src/osd/OSD.cc | 1 + src/osd/OSD.h | 1 + src/osd/PG.cc | 31 +++++++++++++++++++ src/osd/PG.h | 9 ++++++ src/osd/osd_types.cc | 4 +++ src/osd/osd_types.h | 1 + 9 files changed, 54 insertions(+) diff --git a/doc/dev/osd_internals/recovery_reservation.rst b/doc/dev/osd_internals/recovery_reservation.rst index cabea04cc73..4ab03192fe5 100644 --- a/doc/dev/osd_internals/recovery_reservation.rst +++ b/doc/dev/osd_internals/recovery_reservation.rst @@ -62,6 +62,7 @@ to the monitor. The state chart can set: - recovery_wait: waiting for local/remote reservations - recovering: recovering + - recovery_toofull: recovery stopped, OSD(s) above full ratio - backfill_wait: waiting for remote backfill reservations - backfilling: backfilling - backfill_toofull: backfill stopped, OSD(s) above backfillfull ratio diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 9eed4d485e2..c1630a44c01 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -630,6 +630,9 @@ OPTION(osd_min_recovery_priority, OPT_INT, 0) // Seconds to wait before retrying refused backfills OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) +// Seconds to wait before retrying refused recovery +OPTION(osd_recovery_retry_interval, OPT_DOUBLE, 30.0) + // max agent flush ops OPTION(osd_agent_max_ops, OPT_INT, 4) OPTION(osd_agent_max_low_ops, OPT_INT, 2) diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 6669ffcd4b3..477dabce4e0 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1316,6 +1316,8 @@ void PGMonitor::get_health(list >& summary, note["backfilling"] += p->second; if (p->first & PG_STATE_BACKFILL_TOOFULL) note["backfill_toofull"] += p->second; + if (p->first & PG_STATE_RECOVERY_TOOFULL) + note["recovery_toofull"] += p->second; } ceph::unordered_map stuck_pgs; @@ -1403,6 +1405,7 @@ void PGMonitor::get_health(list >& summary, PG_STATE_REPAIR | PG_STATE_RECOVERING | PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERY_TOOFULL | PG_STATE_INCOMPLETE | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL | diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2b2aa97beb0..e39330e0923 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2910,6 +2910,7 @@ void OSD::create_recoverystate_perf() rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency"); rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); + rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency"); recoverystate_perf = rs_perf.create_perf_counters(); cct->get_perfcounters_collection()->add(recoverystate_perf); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9429640a9b5..f6afebd4df0 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -202,6 +202,7 @@ enum { rs_down_latency, rs_getmissing_latency, rs_waitupthru_latency, + rs_notrecovering_latency, rs_last, }; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index e7e21c61302..576ec836dc5 100644 --- a/src/osd/PG.cc 
+++ b/src/osd/PG.cc @@ -3817,6 +3817,16 @@ void PG::schedule_backfill_full_retry() RequestBackfill())); } +void PG::schedule_recovery_full_retry() +{ + Mutex::Locker lock(osd->recovery_request_lock); + osd->recovery_request_timer.add_event_after( + cct->_conf->osd_recovery_retry_interval, + new QueuePeeringEvt( + this, get_osdmap()->get_epoch(), + DoRecovery())); +} + void PG::clear_scrub_reserved() { scrubber.reserved_peers.clear(); @@ -5237,6 +5247,7 @@ void PG::start_peering_interval( state_clear(PG_STATE_PEERED); state_clear(PG_STATE_DOWN); state_clear(PG_STATE_RECOVERY_WAIT); + state_clear(PG_STATE_RECOVERY_TOOFULL); state_clear(PG_STATE_RECOVERING); peer_purged.clear(); @@ -6488,6 +6499,24 @@ void PG::RecoveryState::NotBackfilling::exit() pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); } +/*----NotRecovering------*/ +PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/NotRecovering") +{ + context< RecoveryMachine >().log_enter(state_name); + PG *pg = context< RecoveryMachine >().pg; + pg->publish_stats_to_osd(); +} + +void PG::RecoveryState::NotRecovering::exit() +{ + context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur); +} + /*---RepNotRecovering----*/ PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx) : my_base(ctx), @@ -6737,6 +6766,7 @@ PG::RecoveryState::Recovering::Recovering(my_context ctx) PG *pg = context< RecoveryMachine >().pg; pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); pg->state_set(PG_STATE_RECOVERING); pg->publish_stats_to_osd(); pg->queue_recovery(); @@ -7185,6 +7215,7 @@ void PG::RecoveryState::Active::exit() pg->state_clear(PG_STATE_BACKFILL_TOOFULL); pg->state_clear(PG_STATE_BACKFILL_WAIT); pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); utime_t dur = ceph_clock_now() - enter_time; pg->osd->recoverystate_perf->tinc(rs_active_latency, dur); pg->agent_stop(); diff --git a/src/osd/PG.h b/src/osd/PG.h index 4763859d66b..6ac48fd6cbf 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1340,6 +1340,7 @@ public: void reject_reservation(); void schedule_backfill_full_retry(); + void schedule_recovery_full_retry(); // -- recovery state -- @@ -1850,6 +1851,14 @@ public: boost::statechart::result react(const RemoteReservationRejected& evt); }; + struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved > + > reactions; + explicit NotRecovering(my_context ctx); + void exit(); + }; + struct RepNotRecovering; struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { explicit ReplicaActive(my_context ctx); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2cce17e029f..145a0d76831 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -789,6 +789,8 @@ std::string pg_state_string(int state) oss << "clean+"; if (state & PG_STATE_RECOVERY_WAIT) oss << "recovery_wait+"; + if (state & PG_STATE_RECOVERY_TOOFULL) + oss << "recovery_toofull+"; if (state & PG_STATE_RECOVERING) oss << "recovering+"; if (state & PG_STATE_DOWN) @@ -869,6 +871,8 @@ int pg_string_state(const std::string& state) type = 
PG_STATE_BACKFILL_TOOFULL; else if (state == "recovery_wait") type = PG_STATE_RECOVERY_WAIT; + else if (state == "recovery_toofull") + type = PG_STATE_RECOVERY_TOOFULL; else if (state == "undersized") type = PG_STATE_UNDERSIZED; else if (state == "activating") diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 0d2297582f5..1c4e4c65a6c 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -971,6 +971,7 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) { #define PG_STATE_PEERED (1<<25) // peered, cannot go active, can recover #define PG_STATE_SNAPTRIM (1<<26) // trimming snaps #define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps +#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full std::string pg_state_string(int state); std::string pg_vector_string(const vector &a); From 84088568b5bf4fb3fa48ebf3e157d288f9f29eed Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 5 Apr 2017 14:12:43 -0700 Subject: [PATCH 18/24] osd: Check whether any OSD is full before starting recovery Add event RecoveryTooFull to move to NotRecovering state Signed-off-by: David Zafman --- src/common/config_opts.h | 1 + src/osd/PG.cc | 18 ++++++++++++++++++ src/osd/PG.h | 5 ++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index c1630a44c01..c742ae6fa20 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -871,6 +871,7 @@ OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL, false) OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0) OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false) // inject failure during copyfrom completion OPTION(osd_debug_misdirected_ops, OPT_BOOL, false) +OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL, false) OPTION(osd_enxio_on_misdirected_op, OPT_BOOL, false) OPTION(osd_debug_verify_cached_snaps, OPT_BOOL, false) OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 576ec836dc5..c5a35dbaa4a 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6700,6 +6700,15 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte { context< RecoveryMachine >().log_enter(state_name); PG *pg = context< RecoveryMachine >().pg; + + // Make sure all nodes that part of the recovery aren't full + if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery && + pg->osd->check_osdmap_full(pg->actingbackfill)) { + post_event(RecoveryTooFull()); + return; + } + + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); pg->state_set(PG_STATE_RECOVERY_WAIT); pg->osd->local_reserver.request_reservation( pg->info.pgid, @@ -6710,6 +6719,15 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte pg->publish_stats_to_osd(); } +boost::statechart::result +PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_set(PG_STATE_RECOVERY_TOOFULL); + pg->schedule_recovery_full_retry(); + return transit(); +} + void PG::RecoveryState::WaitLocalRecoveryReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); diff --git a/src/osd/PG.h b/src/osd/PG.h index 6ac48fd6cbf..b2168b3ee64 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1506,6 +1506,7 @@ public: TrivialEvent(RequestRecovery) TrivialEvent(RecoveryDone) TrivialEvent(BackfillTooFull) + TrivialEvent(RecoveryTooFull) TrivialEvent(AllReplicasRecovered) TrivialEvent(DoRecovery) @@ 
-1947,10 +1948,12 @@ public: struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState { typedef boost::mpl::list < - boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved > + boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >, + boost::statechart::custom_reaction< RecoveryTooFull > > reactions; explicit WaitLocalRecoveryReserved(my_context ctx); void exit(); + boost::statechart::result react(const RecoveryTooFull &evt); }; struct Activating : boost::statechart::state< Activating, Active >, NamedState { From 1fafec217599b65d902e21efd43049646e1e145c Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 11 Apr 2017 22:04:07 -0700 Subject: [PATCH 19/24] osd: check_full_status() remove bogus comment and use equivalent computation We actually compute kb_used as kb - kb_avail. We don't have the statfs() system call issue of non-privileged f_bavail vs f_bfree. It was assumed that used was really like (blocks - f_bfree). It is not. Signed-off-by: David Zafman --- src/osd/OSD.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index e39330e0923..c8031826f4e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -716,13 +716,7 @@ void OSDService::check_full_status(const osd_stat_t &osd_stat) { Mutex::Locker l(full_status_lock); - // We base ratio on kb_avail rather than kb_used because they can - // differ significantly e.g. on btrfs volumes with a large number of - // chunks reserved for metadata, and for our purposes (avoiding - // completely filling the disk) it's far more important to know how - // much space is available to use than how much we've already used. - float ratio = ((float)(osd_stat.kb - osd_stat.kb_avail)) / - ((float)osd_stat.kb); + float ratio = ((float)osd_stat.kb_used) / ((float)osd_stat.kb); cur_ratio = ratio; // The OSDMap ratios take precendence. So if the failsafe is .95 and From afd739bed6fb56358c147af32293bff7cc2995a4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 11:41:18 -0700 Subject: [PATCH 20/24] mon: Use currently configured full ratio to determine available space This fixes a bug where available space was derived from the mon_osd_full_ratio default initial value rather than the currently configured full ratio.
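A worked example of the calculation this touches (the numbers are illustrative, not taken from a real cluster): with fratio = 0.95 and an OSD reporting kb = 1000000 and kb_avail = 300000, the space held back is unusable = 1000000 * (1 - 0.95) = 50000 KB, so that OSD contributes avail = MAX(0, 300000 - 50000) = 250000 KB to the rule's projected capacity. Before this change the held-back amount always came from the mon_osd_full_ratio default, so a later "ceph osd set-full-ratio" had no effect on this calculation.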
Signed-off-by: David Zafman --- src/mon/PGMap.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index ee263d3f792..68f4879a538 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -1878,6 +1878,17 @@ int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const return 0; } + float fratio; + if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS) && osdmap.get_full_ratio() > 0) { + fratio = osdmap.get_full_ratio(); + } else if (full_ratio > 0) { + fratio = full_ratio; + } else { + // this shouldn't really happen + fratio = g_conf->mon_osd_full_ratio; + if (fratio > 1.0) fratio /= 100; + } + int64_t min = -1; for (map::iterator p = wm.begin(); p != wm.end(); ++p) { ceph::unordered_map::const_iterator osd_info = @@ -1892,7 +1903,7 @@ int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const continue; } double unusable = (double)osd_info->second.kb * - (1.0 - g_conf->mon_osd_full_ratio); + (1.0 - fratio); double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable); avail *= 1024.0; int64_t proj = (int64_t)(avail / (double)p->second); From c83f11de002a136890281634bc30715434468475 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 11:43:18 -0700 Subject: [PATCH 21/24] mon: Always fix-up full ratios when specified incorrectly in config Signed-off-by: David Zafman --- src/mon/OSDMonitor.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 3566c3f7330..de9fb514259 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -164,8 +164,11 @@ void OSDMonitor::create_initial() if (!g_conf->mon_debug_no_require_luminous) { newmap.set_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS); newmap.full_ratio = g_conf->mon_osd_full_ratio; + if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100; newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; + if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100; newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio; + if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100; } // encode into pending incremental @@ -788,9 +791,11 @@ void OSDMonitor::create_pending() // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config // instead of osd_backfill_full_ratio config if (osdmap.backfillfull_ratio <= 0) { - dout(1) << __func__ << " setting backfillfull_ratio = " - << g_conf->mon_osd_backfillfull_ratio << dendl; pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio; + if (pending_inc.new_backfillfull_ratio > 1.0) + pending_inc.new_backfillfull_ratio /= 100; + dout(1) << __func__ << " setting backfillfull_ratio = " + << pending_inc.new_backfillfull_ratio << dendl; } if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { // transition full ratios from PGMap to OSDMap (on upgrade) @@ -808,14 +813,18 @@ void OSDMonitor::create_pending() } else { // safety check (this shouldn't really happen) if (osdmap.full_ratio <= 0) { - dout(1) << __func__ << " setting full_ratio = " - << g_conf->mon_osd_full_ratio << dendl; pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio; + if (pending_inc.new_full_ratio > 1.0) + pending_inc.new_full_ratio /= 100; + dout(1) << __func__ << " setting full_ratio = " + << pending_inc.new_full_ratio << dendl; } if (osdmap.nearfull_ratio <= 0) { - dout(1) << __func__ << " setting nearfull_ratio = " - << g_conf->mon_osd_nearfull_ratio << dendl; pending_inc.new_nearfull_ratio = 
g_conf->mon_osd_nearfull_ratio; + if (pending_inc.new_nearfull_ratio > 1.0) + pending_inc.new_nearfull_ratio /= 100; + dout(1) << __func__ << " setting nearfull_ratio = " + << pending_inc.new_nearfull_ratio << dendl; } } } From e4cf10d3d89e3b7adcf2b2dcdc02e020b8a793b3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 16:20:27 -0700 Subject: [PATCH 22/24] mon: Issue warning or error if a full ratio out of order The full ratios should be in this order: nearfull, backfillfull, full, failsafe full Signed-off-by: David Zafman --- src/mon/OSDMonitor.cc | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index de9fb514259..680e9e3a6fe 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3359,6 +3359,49 @@ void OSDMonitor::get_health(list >& summary, } if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + // An osd could configure failsafe ratio, to something different + // but for now assume it is the same here. + float fsr = g_conf->osd_failsafe_full_ratio; + if (fsr > 1.0) fsr /= 100; + float fr = osdmap.get_full_ratio(); + float br = osdmap.get_backfillfull_ratio(); + float nr = osdmap.get_nearfull_ratio(); + + bool out_of_order = false; + // These checks correspond to how OSDService::check_full_status() in an OSD + // handles the improper setting of these values. + if (br < nr) { + out_of_order = true; + if (detail) { + ostringstream ss; + ss << "backfill_ratio (" << br << ") < nearfull_ratio (" << nr << "), increased"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + br = nr; + } + if (fr < br) { + out_of_order = true; + if (detail) { + ostringstream ss; + ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br << "), increased"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + fr = br; + } + if (fsr < fr) { + out_of_order = true; + if (detail) { + ostringstream ss; + ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr << "), increased"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + } + if (out_of_order) { + ostringstream ss; + ss << "Full ratio(s) out of order"; + summary.push_back(make_pair(HEALTH_ERR, ss.str())); + } + int full, backfill, nearfull; osdmap.count_full_nearfull_osds(&full, &backfill, &nearfull); if (full > 0) { From 252230786557353f9d52cf633b6400097f9daba3 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Apr 2017 20:42:55 -0700 Subject: [PATCH 23/24] mon, osd: Add detailed full information for now in the mon Show ceph health doc output in the correct order Signed-off-by: David Zafman --- .../troubleshooting/troubleshooting-osd.rst | 9 +++-- src/mon/OSDMonitor.cc | 33 +++++++++++++----- src/osd/OSDMap.cc | 34 +++++++++++++++++++ src/osd/OSDMap.h | 2 ++ 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst index 651907dfb05..fe29f4767f9 100644 --- a/doc/rados/troubleshooting/troubleshooting-osd.rst +++ b/doc/rados/troubleshooting/troubleshooting-osd.rst @@ -222,16 +222,15 @@ lowering the ``mon osd full ratio``, ``mon osd backfillfull ratio`` and Full ``ceph-osds`` will be reported by ``ceph health``:: ceph health - HEALTH_WARN 1 nearfull osds - osd.2 is near full at 85% + HEALTH_WARN 1 nearfull osd(s) Or:: - ceph health - HEALTH_ERR 1 nearfull osds, 1 backfillfull osds, 1 full osds - osd.2 is near full at 85% + ceph health detail + HEALTH_ERR 1 full osd(s); 1 backfillfull osd(s); 
1 nearfull osd(s) osd.3 is full at 97% osd.4 is backfill full at 91% + osd.2 is near full at 87% The best way to deal with a full cluster is to add new ``ceph-osds``, allowing the cluster to redistribute data to the newly available storage. diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 680e9e3a6fe..0b059271fbb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3402,23 +3402,40 @@ void OSDMonitor::get_health(list >& summary, summary.push_back(make_pair(HEALTH_ERR, ss.str())); } - int full, backfill, nearfull; - osdmap.count_full_nearfull_osds(&full, &backfill, &nearfull); - if (full > 0) { + map full, backfillfull, nearfull; + osdmap.get_full_osd_util(mon->pgmon()->pg_map.osd_stat, &full, &backfillfull, &nearfull); + if (full.size()) { ostringstream ss; - ss << full << " full osd(s)"; + ss << full.size() << " full osd(s)"; summary.push_back(make_pair(HEALTH_ERR, ss.str())); } - if (backfill > 0) { + if (backfillfull.size()) { ostringstream ss; - ss << backfill << " backfillfull osd(s)"; + ss << backfillfull.size() << " backfillfull osd(s)"; summary.push_back(make_pair(HEALTH_WARN, ss.str())); } - if (nearfull > 0) { + if (nearfull.size()) { ostringstream ss; - ss << nearfull << " nearfull osd(s)"; + ss << nearfull.size() << " nearfull osd(s)"; summary.push_back(make_pair(HEALTH_WARN, ss.str())); } + if (detail) { + for (auto& i: full) { + ostringstream ss; + ss << "osd." << i.first << " is full at " << roundf(i.second * 100) << "%"; + detail->push_back(make_pair(HEALTH_ERR, ss.str())); + } + for (auto& i: backfillfull) { + ostringstream ss; + ss << "osd." << i.first << " is backfill full at " << roundf(i.second * 100) << "%"; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + for (auto& i: nearfull) { + ostringstream ss; + ss << "osd." 
<< i.first << " is near full at " << roundf(i.second * 100) << "%"; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + } } // note: we leave it to ceph-mgr to generate details health warnings // with actual osd utilizations diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 6d0cbfe0a28..5035fab931a 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1046,6 +1046,40 @@ void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) c } } +static bool get_osd_utilization(const ceph::unordered_map &osd_stat, + int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail) { + auto p = osd_stat.find(id); + if (p == osd_stat.end()) + return false; + *kb = p->second.kb; + *kb_used = p->second.kb_used; + *kb_avail = p->second.kb_avail; + return *kb > 0; +} + +void OSDMap::get_full_osd_util(const ceph::unordered_map &osd_stat, + map *full, map *backfill, map *nearfull) const +{ + full->clear(); + backfill->clear(); + nearfull->clear(); + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + int64_t kb, kb_used, kb_avail; + if (osd_state[i] & CEPH_OSD_FULL) { + if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail)) + full->emplace(i, (float)kb_used / (float)kb); + } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) { + if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail)) + backfill->emplace(i, (float)kb_used / (float)kb); + } else if (osd_state[i] & CEPH_OSD_NEARFULL) { + if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail)) + nearfull->emplace(i, (float)kb_used / (float)kb); + } + } + } +} + void OSDMap::get_all_osds(set& ls) const { for (int i=0; i &osd_stat, + map *full, map *backfill, map *nearfull) const; /***** cluster state *****/ /* osds */ From 3becdd313852e55a1b4081011b6096255b7149e4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 14 Apr 2017 17:36:17 -0700 Subject: [PATCH 24/24] test: Test health check output for full ratios Test out of order ratios summary and details Test various full osd conditions summary and details Signed-off-by: David Zafman --- qa/workunits/cephtool/test.sh | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index a6c7b617cc6..353e5f74518 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1424,6 +1424,39 @@ function test_mon_pg() ceph osd set-nearfull-ratio .892 ceph osd dump | grep '^nearfull_ratio 0.892' + # Check health status + ceph osd set-nearfull-ratio .913 + ceph health | grep 'HEALTH_ERR Full ratio(s) out of order' + ceph health detail | grep 'backfill_ratio (0.912) < nearfull_ratio (0.913), increased' + ceph osd set-nearfull-ratio .892 + ceph osd set-backfillfull-ratio .963 + ceph health detail | grep 'full_ratio (0.962) < backfillfull_ratio (0.963), increased' + ceph osd set-backfillfull-ratio .912 + + # Check injected full results + WAITFORFULL=10 + ceph --admin-daemon $CEPH_OUT_DIR/osd.0.asok injectfull nearfull + sleep $WAITFORFULL + ceph health | grep "HEALTH_WARN.*1 nearfull osd(s)" + ceph --admin-daemon $CEPH_OUT_DIR/osd.1.asok injectfull backfillfull + sleep $WAITFORFULL + ceph health | grep "HEALTH_WARN.*1 backfillfull osd(s)" + ceph --admin-daemon $CEPH_OUT_DIR/osd.2.asok injectfull failsafe + sleep $WAITFORFULL + # failsafe and full are the same as far as the monitor is concerned + ceph health | grep "HEALTH_ERR.*1 full osd(s)" + ceph --admin-daemon $CEPH_OUT_DIR/osd.0.asok injectfull full + sleep $WAITFORFULL + ceph health | 
grep "HEALTH_ERR.*2 full osd(s)" + ceph health detail | grep "osd.0 is full at.*%" + ceph health detail | grep "osd.2 is full at.*%" + ceph health detail | grep "osd.1 is backfill full at.*%" + ceph --admin-daemon $CEPH_OUT_DIR/osd.0.asok injectfull none + ceph --admin-daemon $CEPH_OUT_DIR/osd.1.asok injectfull none + ceph --admin-daemon $CEPH_OUT_DIR/osd.2.asok injectfull none + sleep $WAITFORFULL + ceph health | grep HEALTH_OK + ceph pg stat | grep 'pgs:' ceph pg 0.0 query ceph tell 0.0 query
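The cephtool workunit above is the test of record for this series. For a quick manual smoke test against a development cluster, something along the following lines should exercise the same paths; treat it as a sketch rather than part of the patches: the osd id, the fixed 10 second settle time, and the use of the "ceph daemon osd.N ..." wrapper (equivalent to the "ceph --admin-daemon <path to osd.N.asok> ..." form used in the workunit) are assumptions about the local setup, and the injectfull handler also parses an optional count argument (default -1) that is not shown here.

    ceph osd set-nearfull-ratio .85        # keep the ratios ordered: nearfull <= backfillfull <= full,
    ceph osd set-backfillfull-ratio .90    # otherwise the mon now reports HEALTH_ERR "Full ratio(s) out of order"
    ceph osd set-full-ratio .95
    ceph osd dump | grep '_ratio'          # confirm the osdmap picked up the new values
    ceph daemon osd.0 injectfull nearfull  # make osd.0 pretend it is nearfull
    sleep 10                               # give the osd and mon time to notice
    ceph health                            # expect HEALTH_WARN with "1 nearfull osd(s)"
    ceph daemon osd.0 injectfull full
    sleep 10
    ceph health detail                     # expect HEALTH_ERR with "1 full osd(s)" and an "osd.0 is full at ...%" line
    ceph daemon osd.0 injectfull none      # clear the injected state
    sleep 10
    ceph health                            # back to HEALTH_OK on an otherwise healthy cluster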