From 930dc603e7b0d0706d067c864be89f195aca4a45 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 18 Feb 2017 15:31:39 -0500 Subject: [PATCH 1/5] vstart.sh: osd debug misdirected ops = true Signed-off-by: Sage Weil --- src/vstart.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vstart.sh b/src/vstart.sh index e238f8ab3ae..49bbe4c574b 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -578,6 +578,7 @@ $DAEMONOPTS osd class default list = * osd scrub load threshold = 2000.0 osd debug op order = true + osd debug misdirected ops = true filestore wbthrottle xfs ios start flusher = 10 filestore wbthrottle xfs ios hard limit = 20 filestore wbthrottle xfs inodes hard limit = 30 From fce285a5ffbcfbd36c9ac525af953ab4bd7d32c6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Feb 2017 07:45:32 -0500 Subject: [PATCH 2/5] osd: warn on ops directed to the wrong pg_t Check whether the request hobj maps to the current pg_t. If we have the osd_debug_misdirected_ops setting enabled (as teuthology does), assert out as well so that the error is easy to spot. This catches bugs in the Objecter (especially the new code that explicitly names the spg_t for the request). Signed-off-by: Sage Weil --- src/osd/PrimaryLogPG.cc | 11 +++++++++++ src/osd/osd_types.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 2ed79b3f3e5..0b52e247327 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1713,6 +1713,17 @@ void PrimaryLogPG::do_op(OpRequestRef& op) hobject_t head = m->get_hobj(); head.snap = CEPH_NOSNAP; + if (!info.pgid.pgid.contains( + info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) { + derr << __func__ << " " << info.pgid.pgid << " does not contain " + << head << " pg_num " << pool.info.get_pg_num() << " hash " + << std::hex << head.get_hash() << std::dec << dendl; + osd->clog->warn() << info.pgid.pgid << " does not contain " << head + << " op " << *m << "\n"; + assert(!cct->_conf->osd_debug_misdirected_ops); + return; + } + bool can_backoff = m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF); SessionRef session; diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 671e0ecc262..3be67054e7e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -402,6 +402,9 @@ struct pg_t { bool contains(int bits, const ghobject_t& oid) { return oid.match(bits, ps()); } + bool contains(int bits, const hobject_t& oid) { + return oid.match(bits, ps()); + } hobject_t get_hobj_start() const; hobject_t get_hobj_end(unsigned pg_num) const; From 57dfcb62c1619b97213b2195244bd0917832e1b3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Feb 2017 13:19:39 -0500 Subject: [PATCH 3/5] osdc/Objecter: track latest epoch in op_target_t Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 4 +++- src/osdc/Objecter.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index d83bcf17b50..87dd5aa6fab 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2661,7 +2661,9 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) // rwlock is locked bool is_read = t->flags & CEPH_OSD_FLAG_READ; bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; - ldout(cct,20) << __func__ << " base " << t->base_oid << " " << t->base_oloc + t->epoch = osdmap->get_epoch(); + ldout(cct,20) << __func__ << " epoch " << t->epoch + << " base " << t->base_oid << " " << t->base_oloc << " precalc_pgid " << (int)t->precalc_pgid << " pgid " << t->base_pgid << (is_read ? " is_read" : "") diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index fc8dce0304c..bea8d136abe 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1182,6 +1182,9 @@ public: struct op_target_t { int flags = 0; + + epoch_t epoch = 0; ///< latest epoch we calculated the mapping + object_t base_oid; object_locator_t base_oloc; object_t target_oid; From 71f5309fe70592e54a6ff3507faff34032edf955 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Feb 2017 13:55:10 -0500 Subject: [PATCH 4/5] osdc/Objecter: refactor pool dne check to make op->session optional Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 35 ++++++++++++++++++++--------------- src/osdc/Objecter.h | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 87dd5aa6fab..aac9ce6e1cc 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1085,7 +1085,7 @@ void Objecter::_scan_requests(OSDSession *s, _op_cancel_map_check(op); break; case RECALC_OP_TARGET_POOL_DNE: - _check_op_pool_dne(op, sl); + _check_op_pool_dne(op, &sl); break; } } @@ -1373,7 +1373,7 @@ void Objecter::C_Op_Map_Latest::finish(int r) op->map_dne_bound = latest; OSDSession::unique_lock sl(op->session->lock, defer_lock); - objecter->_check_op_pool_dne(op, sl); + objecter->_check_op_pool_dne(op, &sl); op->put(); } @@ -1436,7 +1436,7 @@ int Objecter::pool_snap_list(int64_t poolid, vector *snaps) } // sl may be unlocked. -void Objecter::_check_op_pool_dne(Op *op, unique_lock& sl) +void Objecter::_check_op_pool_dne(Op *op, unique_lock *sl) { // rwlock is locked unique @@ -1464,16 +1464,19 @@ void Objecter::_check_op_pool_dne(Op *op, unique_lock& sl) } OSDSession *s = op->session; - assert(s != NULL); - assert(sl.mutex() == &s->lock); - - bool session_locked = sl.owns_lock(); - if (!session_locked) { - sl.lock(); - } - _finish_op(op, 0); - if (!session_locked) { - sl.unlock(); + if (s) { + assert(s != NULL); + assert(sl->mutex() == &s->lock); + bool session_locked = sl->owns_lock(); + if (!session_locked) { + sl->lock(); + } + _finish_op(op, 0); + if (!session_locked) { + sl->unlock(); + } + } else { + _finish_op(op, 0); // no session } } } else { @@ -2989,7 +2992,7 @@ void Objecter::_finish_op(Op *op, int r) { ldout(cct, 15) << "finish_op " << op->tid << dendl; - // op->session->lock is locked unique + // op->session->lock is locked unique or op->session is null if (!op->ctx_budgeted && op->budgeted) put_op_budget(op); @@ -2997,7 +3000,9 @@ void Objecter::_finish_op(Op *op, int r) if (op->ontimeout && r != -ETIMEDOUT) timer.cancel_event(op->ontimeout); - _session_op_remove(op->session, op); + if (op->session) { + _session_op_remove(op->session, op); + } logger->dec(l_osdc_op_active); diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index bea8d136abe..68d15db818b 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1847,7 +1847,7 @@ public: } private: - void _check_op_pool_dne(Op *op, unique_lock& sl); + void _check_op_pool_dne(Op *op, unique_lock *sl); void _send_op_map_check(Op *op); void _op_cancel_map_check(Op *op); void _check_linger_pool_dne(LingerOp *op, bool *need_unregister); From bbd9f0f0b2277c6ba94bd78f828f90563547c360 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Feb 2017 14:26:42 -0500 Subject: [PATCH 5/5] osdc/Objecter: _calc_target on all ops so that we notice splits We need to make sure we update the mapping and get an accurate actual_pgid value by recalcuating the mapping on every map change. Otherwise, we may not notice a split (and subsequent actual_pgid change) and resend the same op with a stale spg_t. To fix this, - _calc_target on need_resend - update target regardless of current con Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 51 ++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index aac9ce6e1cc..69a141329a3 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1208,11 +1208,11 @@ void Objecter::handle_osd_map(MOSDMap *m) cluster_full = cluster_full || _osdmap_full_flag(); update_pool_full_map(pool_full_map); + + // check all outstanding requests on every epoch _scan_requests(homeless_session, skipped_map, cluster_full, &pool_full_map, need_resend, need_resend_linger, need_resend_command, sul); - - // osd addr changes? for (map::iterator p = osd_sessions.begin(); p != osd_sessions.end(); ) { OSDSession *s = p->second; @@ -1220,6 +1220,7 @@ void Objecter::handle_osd_map(MOSDMap *m) &pool_full_map, need_resend, need_resend_linger, need_resend_command, sul); ++p; + // osd down or addr change? if (!osdmap->is_up(s->osd) || (s->con && s->con->get_peer_addr() != osdmap->get_inst(s->osd).addr)) { @@ -1255,6 +1256,23 @@ void Objecter::handle_osd_map(MOSDMap *m) } } + // make sure need_resend targets reflect latest map + for (auto p = need_resend.begin(); p != need_resend.end(); ) { + Op *op = p->second; + if (op->target.epoch < osdmap->get_epoch()) { + ldout(cct, 10) << __func__ << " checking op " << p->first << dendl; + int r = _calc_target(&op->target, nullptr); + if (r == RECALC_OP_TARGET_POOL_DNE) { + p = need_resend.erase(p); + _check_op_pool_dne(op, nullptr); + } else { + ++p; + } + } else { + ++p; + } + } + bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag() || _osdmap_has_pool_full(); @@ -2756,20 +2774,23 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) force_resend = true; } - bool need_resend = false; - - bool paused = target_should_be_paused(t); - if (!paused && paused != t->paused) { + bool unpaused = false; + if (t->paused && !target_should_be_paused(t)) { t->paused = false; - need_resend = true; + unpaused = true; } - if (t->pgid != pgid || + + bool legacy_change = + t->pgid != pgid || is_pg_changed( t->acting_primary, t->acting, acting_primary, acting, - t->used_replica || any_change) || - (con && con->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) && - prev_pgid.is_split(t->pg_num, pg_num, nullptr)) || - force_resend) { + t->used_replica || any_change); + bool split = false; + if (t->pg_num) { + split = prev_pgid.is_split(t->pg_num, pg_num, nullptr); + } + + if (legacy_change || split || force_resend) { t->pgid = pgid; t->acting = acting; t->acting_primary = acting_primary; @@ -2829,9 +2850,11 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) } t->osd = osd; } - need_resend = true; } - if (need_resend) { + if (legacy_change || unpaused || force_resend) { + return RECALC_OP_TARGET_NEED_RESEND; + } + if (split && con && con->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT)) { return RECALC_OP_TARGET_NEED_RESEND; } return RECALC_OP_TARGET_NO_ACTION;