From aeb8fef92469831d94f06db457a4ba15b5b0e3c5 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 12 Dec 2016 10:33:13 -0800 Subject: [PATCH 1/3] PG::handle_advance_map: add debugging option to verify cached_removed_snaps Signed-off-by: Samuel Just --- src/common/config_opts.h | 1 + src/osd/PG.cc | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 9479e07a96d..5a77f037cb8 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -841,6 +841,7 @@ OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0) OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false) // inject failure during copyfrom completion OPTION(osd_debug_randomize_hobject_sort_order, OPT_BOOL, false) OPTION(osd_debug_misdirected_ops, OPT_BOOL, false) +OPTION(osd_debug_verify_cached_snaps, OPT_BOOL, false) OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking OPTION(osd_num_op_tracker_shard, OPT_U32, 32) // The number of shards for holding the ops OPTION(osd_op_history_size, OPT_U32, 20) // Max number of completed ops to track diff --git a/src/osd/PG.cc b/src/osd/PG.cc index b12e099614c..e02b07756de 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5694,6 +5694,19 @@ void PG::handle_advance_map( << dendl; update_osdmap_ref(osdmap); pool.update(osdmap); + if (cct->_conf->osd_debug_verify_cached_snaps) { + interval_set actual_removed_snaps; + const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool()); + assert(pi); + pi->build_removed_snaps(actual_removed_snaps); + if (!(actual_removed_snaps == pool.cached_removed_snaps)) { + derr << __func__ << ": mismatch between the actual removed snaps " + << actual_removed_snaps << " and pool.cached_removed_snaps " + << " pool.cached_removed_snaps " << pool.cached_removed_snaps + << dendl; + } + assert(actual_removed_snaps == pool.cached_removed_snaps); + } AdvMap evt( osdmap, lastmap, newup, up_primary, newacting, acting_primary); From d4b6615a49e4635113f9ba900e9c57147b224357 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 14 Dec 2016 15:48:59 -0800 Subject: [PATCH 2/3] qa/config/rados.yaml: enable osd_debug_verify_cached_snaps Also, make map gaps more likely. Signed-off-by: Samuel Just --- qa/config/rados.yaml | 1 + qa/suites/rados/thrash/thrashers/mapgap.yaml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/qa/config/rados.yaml b/qa/config/rados.yaml index 3e8492f91d3..eb24e5e04d6 100644 --- a/qa/config/rados.yaml +++ b/qa/config/rados.yaml @@ -5,3 +5,4 @@ overrides: osd op queue: debug_random osd op queue cut off: debug_random osd debug verify missing on start: true + osd debug verify cached snaps: true diff --git a/qa/suites/rados/thrash/thrashers/mapgap.yaml b/qa/suites/rados/thrash/thrashers/mapgap.yaml index 5c59340c7eb..0ac5cfc49c6 100644 --- a/qa/suites/rados/thrash/thrashers/mapgap.yaml +++ b/qa/suites/rados/thrash/thrashers/mapgap.yaml @@ -18,6 +18,6 @@ tasks: - osd_map_cache_size - thrashosds: timeout: 1800 - chance_pgnum_grow: 1 - chance_pgpnum_fix: 1 - chance_test_map_discontinuity: 0.5 + chance_pgnum_grow: 0.25 + chance_pgpnum_fix: 0.25 + chance_test_map_discontinuity: 2 From 5642e7e1b3bb6ffceddacd2f4030eb13a17fcccc Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 12 Dec 2016 10:35:38 -0800 Subject: [PATCH 3/3] PG: fix cached_removed_snaps bug in PGPool::update after map gap 5798fb3bf6d726d14a9c5cb99dc5902eba5b878a actually made 15943 worse by always creating an out-of-date cached_removed_snaps value after a map gap rather than only in the case where the the first map after the gap did not remove any snapshots. Introduced: 5798fb3bf6d726d14a9c5cb99dc5902eba5b878a Fixes: http://tracker.ceph.com/issues/15943 Signed-off-by: Samuel Just --- src/osd/PG.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index e02b07756de..797156386e2 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -164,7 +164,7 @@ void PGPool::update(OSDMapRef map) auid = pi->auid; name = map->get_pool_name(id); bool updated = false; - if ((map->get_epoch() == cached_epoch + 1) && + if ((map->get_epoch() != cached_epoch + 1) || (pi->get_snap_epoch() == map->get_epoch())) { updated = true; pi->build_removed_snaps(newly_removed_snaps); @@ -182,6 +182,16 @@ void PGPool::update(OSDMapRef map) } snapc = pi->get_snap_context(); } else { + /* 1) map->get_epoch() == cached_epoch + 1 && + * 2) pi->get_snap_epoch() != map->get_epoch() + * + * From the if branch, 1 && 2 must be true. From 2, we know that + * this map didn't change the set of removed snaps. From 1, we + * know that our cached_removed_snaps matches the previous map. + * Thus, from 1 && 2, cached_removed snaps matches the current + * set of removed snaps and all we have to do is clear + * newly_removed_snaps. + */ newly_removed_snaps.clear(); } cached_epoch = map->get_epoch();