Merge pull request #19796 from LiumxNL/fix-ooo-caused-con-reset

osd: fix out of order caused by letting old msg from down osd be processed

Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Sage Weil 2018-01-10 07:01:04 -06:00 committed by GitHub
commit a7dc224536
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 2 deletions

View File

@ -537,6 +537,12 @@ public:
}
}
}
// Returns a copy of next_osdmap, read while holding pre_publish_lock.
// When next_osdmap is unset the result is a default-constructed (empty)
// OSDMapRef, so callers should test the returned ref before using it.
OSDMapRef get_next_osdmap() {
  Mutex::Locker guard(pre_publish_lock);
  return next_osdmap ? next_osdmap : OSDMapRef();
}
private:
Mutex peer_map_epoch_lock;

View File

@ -5737,14 +5737,15 @@ bool PG::can_discard_replica_op(OpRequestRef& op)
// connection to it when handling the new osdmap marking it down, and also
// resets the messenger session when the replica reconnects. to avoid the
// out-of-order replies, the messages from that replica should be discarded.
if (osd->get_osdmap()->is_down(from))
OSDMapRef next_map = osd->get_next_osdmap();
if (next_map->is_down(from))
return true;
/* Mostly, this overlaps with the old_peering_msg
* condition. An important exception is pushes
* sent by replicas not in the acting set, since
* if such a replica goes down it does not cause
* a new interval. */
if (get_osdmap()->get_down_at(from) >= m->map_epoch)
if (next_map->get_down_at(from) >= m->map_epoch)
return true;
// same pg?