Merge pull request #19796 from LiumxNL/fix-ooo-caused-con-reset

osd: fix out of order caused by letting old msg from down osd be processed

Reviewed-by: Sage Weil <sage@redhat.com>
2024-12-18 09:25:49 +00:00 · 2018-01-10 07:01:04 -06:00 · 2018-01-10 07:01:04 -06:00 · a7dc224536
commit a7dc224536
parent 58170f5027 dc279bdd86
2 changed files with 9 additions and 2 deletions
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -537,6 +537,12 @@ public:
      }
    }
  }
+  OSDMapRef get_next_osdmap() {  // returns next_osdmap, or an empty OSDMapRef if none is set
+    Mutex::Locker l(pre_publish_lock);  // hold pre_publish_lock for the rest of this scope while reading next_osdmap
+    if (!next_osdmap)
+      return OSDMapRef();  // nothing set: result is empty, so callers must check before dereferencing
+    return next_osdmap;
+  }

 private:
  Mutex peer_map_epoch_lock;
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -5737,14 +5737,15 @@ bool PG::can_discard_replica_op(OpRequestRef& op)
  // connection to it when handling the new osdmap marking it down, and also
  // resets the messenger sesssion when the replica reconnects. to avoid the
  // out-of-order replies, the messages from that replica should be discarded.
-  if (osd->get_osdmap()->is_down(from))
+  OSDMapRef next_map = osd->get_next_osdmap();
+  if (next_map->is_down(from))
    return true;
  /* Mostly, this overlaps with the old_peering_msg
   * condition.  An important exception is pushes
   * sent by replicas not in the acting set, since
   * if such a replica goes down it does not cause
   * a new interval. */
-  if (get_osdmap()->get_down_at(from) >= m->map_epoch)
+  if (next_map->get_down_at(from) >= m->map_epoch)
    return true;

  // same pg?