From b79442efce479fde7f8bda8fdadf86e91003a327 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 20 Dec 2018 08:59:55 -0600 Subject: [PATCH 1/2] osd/PG: align past_intervals and last_epoch_clean for fabricated merge target When we are fabricating a merge target, we have to construct a meaningful pg_history_t and PastIntervals. We do this with pieces of the source PG and the last_epoch_clean and last_epoch_started values from the pg_pool_t. This usually works, except when - source and target become clean, and we decrement pg_num - OSD mapping changes (for source and target) - source repeers, but target does not - OSD with source only tries to merge In this case, the source will have a past_intervals start that is later than the last_epoch_clean implied in the pg_pool_t. This situation is harmless because we do not allow the actual mappings of source and target to diverge during the merge window, so if the source's past intervals was adjusted we can still use it. Avoid logging errors by adjusting the start epoch backwards. Fixes: http://tracker.ceph.com/issues/37511 Signed-off-by: Sage Weil --- src/osd/PG.cc | 11 +++++++++++ src/osd/osd_types.cc | 4 ++++ src/osd/osd_types.h | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 49345cc2a61..f42a8d04f1d 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2765,6 +2765,17 @@ void PG::merge_from(map& sources, RecoveryCtx *rctx, // remapped in concert with each other... info.history = sources.begin()->second->info.history; + // if the past_intervals start is later than last_epoch_clean, it implies + // the source repeered again but the target didn't. avoid the discrepancy + // but adjusting the interval start backwards to match. + auto pib = past_intervals.get_bounds(); + if (info.history.last_epoch_clean < pib.first) { + dout(10) << __func__ << " last_epoch_clean " + << info.history.last_epoch_clean << " < past_interval start " + << pib.first << ", adjusting start backwards" << dendl; + past_intervals.adjust_start_backwards(info.history.last_epoch_clean); + } + // we use the pg_num_dec_last_epoch_clean we got from the caller, which is // the epoch that was clean according to the target pg whe it requested // the mon decrement pg_num. diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2b548536800..065fb5459e9 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -3403,6 +3403,10 @@ public: pair get_bounds() const override { return make_pair(first, last + 1); } + void adjust_start_backwards(epoch_t last_epoch_clean) { + first = last_epoch_clean; + } + set get_all_participants( bool ec_pool) const override { return all_participants; diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 3336db4e926..9ad9353aa1c 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2935,6 +2935,7 @@ public: ceph_assert(!has_full_intervals()); ceph_abort_msg("not valid for this implementation"); } + virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0; virtual ~interval_rep() {} }; @@ -3101,6 +3102,11 @@ public: return past_intervals->get_bounds(); } + void adjust_start_backwards(epoch_t last_epoch_clean) { + ceph_assert(past_intervals); + past_intervals->adjust_start_backwards(last_epoch_clean); + } + enum osd_state_t { UP, DOWN, From 5e189036ded094254b0b013fd75ebdca65a02d6e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 20 Dec 2018 10:03:10 -0600 Subject: [PATCH 2/2] osd/OSDMap: disallow new upmaps on pgs that are pending merge It's critical that we keep the source and target PGs mapped to the same place. Prevent new pg_upmaps on merge source or target PGs. Signed-off-by: Sage Weil --- src/osd/OSDMap.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 676638fdfbc..30edcfebd1c 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1755,8 +1755,16 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct, to_check.insert(p.first); } for (auto& pg : to_check) { - if (!nextmap.pg_exists(pg)) { - ldout(cct, 0) << __func__ << " pg " << pg << " is gone" << dendl; + const pg_pool_t *pi = nextmap.get_pg_pool(pg.pool()); + if (!pi || pg.ps() >= pi->get_pg_num_pending()) { + ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source" + << dendl; + to_cancel.insert(pg); + continue; + } + if (pi->is_pending_merge(pg, nullptr)) { + ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge" + << dendl; to_cancel.insert(pg); continue; }