From dfb78ebf3ea83618d72c7b38d9c07a535543cf8b Mon Sep 17 00:00:00 2001
From: Colin Patrick McCabe <cmccabe@alumni.cmu.edu>
Date: Mon, 15 Nov 2010 16:19:27 -0800
Subject: [PATCH] osd: don't stop recovery when there are unfound

There are two phases in recovery: one where we get all the right objects
on to the primary, and another where we push all those objects out to
the replicas. Formerly, we would not start the second phase until there
were no missing objects at all on the primary.

This change modifies that so that we will start the second phase even if
there are unfound objects. However, we will still wait for all findable
missing objects to be brought to us, of course.

Get rid of uptodate_set. We can find the same information by looking at
the missing and missing_loc sets directly. Keeping the uptodate_set...
er... up-to-date would be very difficult in the presence of all the things
that can modify the missing and missing_loc sets.

Signed-off-by: Colin McCabe <colinm@hq.newdream.net>
---
 src/osd/PG.cc           | 41 +++++++++++++++++-----
 src/osd/PG.h            |  3 +-
 src/osd/ReplicatedPG.cc | 78 ++++++++++++++++++++++++++---------------
 3 files changed, 83 insertions(+), 39 deletions(-)
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index d23ec191f38..d67afd20c9d 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -765,6 +765,38 @@ ostream& PG::IndexedLog::print(ostream& out) const
 
 
 /******* PG ***********/
+bool PG::is_all_uptodate() const
+{
+  assert(is_primary());
+
+  if (up != acting) {
+    dout(10) << __func__ << ": the set of UP osds is not the same as the "
+	     << "set of ACTING osds." << dendl;
+    return false;
+  }
+
+  size_t ml_sz = missing_loc.size();
+  vector<int>::const_iterator end = acting.end();
+  for (vector<int>::const_iterator a = acting.begin(); a != end; ++a) {
+    int peer = *a;
+    map<int, Missing>::const_iterator pm = peer_missing.find(peer);
+    if (pm == peer_missing.end()) {
+      dout(10) << __func__ << ": osd " << peer << " is not up-to-date. We "
+	       << "have no peer_missing information for this osd." << dendl;
+      return false;
+    }
+    size_t m_sz = pm->second.num_missing();
+    if (m_sz != ml_sz) {
+      dout(10) << __func__ << ": osd " << peer << " is not up-to-date. "
+	       << "num_missing = " << m_sz << ", but missing_loc.size() = "
+	       << ml_sz << dendl;
+      return false;
+    }
+  }
+
+  dout(10) << __func__ << ": all OSDs are uptodate!" << dendl;
+  return true;
+}
 
 void PG::generate_past_intervals()
 {
@@ -1586,11 +1618,6 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
 
   // if primary..
   if (is_primary()) {
-    // who is clean?
-    uptodate_set.clear();
-    if (info.is_uptodate()) 
-      uptodate_set.insert(osd->whoami);
-    
     // start up replicas
     for (unsigned i=1; i<acting.size(); i++) {
       int peer = acting[i];
@@ -1649,13 +1676,10 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
       if (pm.num_missing() == 0) {
 	pi.last_complete = pi.last_update;
         dout(10) << "activate peer osd" << peer << " already uptodate, " << pi << dendl;
-	assert(pi.is_uptodate());
-        uptodate_set.insert(peer);
       } else {
         dout(10) << "activate peer osd" << peer << " " << pi
                  << " missing " << pm << dendl;
       }
-            
     }
 
     // discard unneeded peering state
@@ -2836,7 +2860,6 @@ void PG::repair_object(ScrubMap::object *po, int bad_peer, int ok_peer)
 
     log.last_requested = eversion_t();
   }
-  uptodate_set.erase(bad_peer);
   osd->queue_for_recovery(this);
 }
 
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 230026f7249..0715e4b9dd0 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -181,7 +181,6 @@ public:
     Info() : log_backlog(false) {}
     Info(pg_t p) : pgid(p), log_backlog(false) { }
 
-    bool is_uptodate() const { return last_update == last_complete; }
     bool is_empty() const { return last_update.version == 0; }
     bool dne() const { return history.epoch_created == 0; }
 
@@ -794,7 +793,7 @@ public:
   bool is_prior(int osd) const { return prior_set.count(osd); }
   bool is_stray(int osd) const { return stray_set.count(osd); }
   
-  bool is_all_uptodate() const { return uptodate_set.size() == acting.size() && up == acting; }
+  bool is_all_uptodate() const;
 
   void generate_past_intervals();
   void trim_past_intervals();
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 6205b0c1bd0..2be62d607cb 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -3083,8 +3083,6 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply)
     } else {
       // done!
       peer_missing[peer].got(soid, pi->version);
-      if (peer_missing[peer].num_missing() == 0) 
-	uptodate_set.insert(peer);
       
       pushing[soid].erase(peer);
       pi = NULL;
@@ -3486,9 +3484,6 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op)
       finish_recovery_op(soid);
       
       update_stats();
-
-      if (info.is_uptodate())
-	uptodate_set.insert(osd->get_nodeid());
     } else {
       // pull more
       pi->data_subset_pulling.span_of(pi->data_subset, data_subset.range_end(), g_conf.osd_recovery_max_chunk);
@@ -3631,10 +3626,20 @@ int ReplicatedPG::start_recovery_ops(int max)
   
   while (max > 0) {
     int n;
-    if (uptodate_set.count(osd->whoami))
+
+    size_t ml_sz = missing_loc.size();
+    size_t m_sz = missing.missing.size();
+
+    assert(m_sz >= ml_sz);
+
+    if (m_sz == ml_sz) {
+      // All of the missing objects we have are unfound.
       n = recover_replicas(max);
-    else
+    }
+    else {
+      // We still have missing objects that we should grab from replicas.
       n = recover_primary(max);
+    }
     started += n;
     osd->logger->inc(l_osd_rop, n);
     if (n < max)
@@ -3762,7 +3767,6 @@ int ReplicatedPG::recover_primary(int max)
 
   log.reset_recovery_pointers();
 
-  uptodate_set.insert(osd->whoami);
   if (is_all_uptodate()) {
     dout(-7) << "recover_primary complete" << dendl;
     ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -3809,43 +3813,61 @@ int ReplicatedPG::recover_object_replicas(const sobject_t& soid)
 
 int ReplicatedPG::recover_replicas(int max)
 {
+  dout(10) << __func__ << "(" << max << ")" << dendl;
+  bool all_uptodate = true;
   int started = 0;
-  dout(-10) << "recover_replicas" << dendl;
+
+  // Is the primary uptodate?
+  size_t ml_size = missing_loc.size();
+  if (missing.num_missing() > ml_size)
+    all_uptodate = false;
 
   // this is FAR from an optimal recovery order.  pretty lame, really.
   for (unsigned i=1; i<acting.size(); i++) {
     int peer = acting[i];
-    assert(peer_missing.count(peer));
+    map<int, Missing>::const_iterator pm = peer_missing.find(peer);
+    assert(pm != peer_missing.end());
+    size_t m_sz = pm->second.num_missing();
 
-    dout(10) << " peer osd" << peer << " missing " << peer_missing[peer] << dendl;
-    dout(20) << "   " << peer_missing[peer].missing << dendl;
+    dout(10) << " peer osd" << peer << " missing " << m_sz << " objects." << dendl;
+    dout(20) << " peer osd" << peer << " missing " << pm->second.missing << dendl;
 
     // oldest first!
-    Missing &m = peer_missing[peer];
-    for (map<eversion_t, sobject_t>::iterator p = m.rmissing.begin();
+    const Missing &m(pm->second);
+    for (map<eversion_t, sobject_t>::const_iterator p = m.rmissing.begin();
 	   p != m.rmissing.end() && started < max;
-	   p++) {
-      sobject_t soid = p->second;
-      if (pushing.count(soid))
-	dout(10) << " already pushing " << soid << dendl;
-      else if (missing.is_missing(soid))
-	dout(10) << " still missing on primary " << soid << dendl;
-      else
-	started += recover_object_replicas(soid);
+	   ++p) {
+      const sobject_t soid(p->second);
+      if (pushing.count(soid)) {
+	dout(10) << __func__ << ": already pushing " << soid << dendl;
+	all_uptodate = false;
+	continue;
+      }
+      if (missing_loc.find(soid) == missing_loc.end()) {
+	dout(10) << __func__ << ": " << soid << " is still unfound." << dendl;
+	continue;
+      }
+      if (missing.is_missing(soid)) {
+	dout(10) << __func__ << ": still missing on primary " << soid << dendl;
+	all_uptodate = false;
+	continue;
+      }
+      dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
+      started += recover_object_replicas(soid);
+      all_uptodate = false;
     }
   }
   
-  // nothing to do!
-  dout(-10) << "recover_replicas - nothing to do!" << dendl;
-
-  if (is_all_uptodate()) {
+  if (all_uptodate) {
+    dout(10) << __func__ << ": all replicas are up-to-date!" << dendl;
     ObjectStore::Transaction *t = new ObjectStore::Transaction;
     C_Contexts *fin = new C_Contexts;
     finish_recovery(*t, fin->contexts);
     int tr = osd->store->queue_transaction(&osr, t, new ObjectStore::C_DeleteTransaction(t), fin);
     assert(tr == 0);
-  } else {
-    dout(10) << "recover_replicas not all uptodate, acting " << acting << ", uptodate " << uptodate_set << dendl;
+  }
+  else {
+    dout(20) << __func__ << ": some replicas are not up-to-date." << dendl;
   }
 
   return started;