From 93ef911bbfcaf2a61b93380d54578e1d796d04f7 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 11 Nov 2014 13:28:31 -0800 Subject: [PATCH 1/4] PG: add config to ignore history les in find_best_info Signed-off-by: Samuel Just --- src/common/config_opts.h | 3 +++ src/osd/PG.cc | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b13eb098260..92065551221 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -468,6 +468,9 @@ OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1) OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1) OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0) +// osd ignore history.last_epoch_started in find_best_info +OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL, false) + // decay atime and hist histograms after how many objects go by OPTION(osd_agent_hist_halflife, OPT_INT, 1000) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index c124f7ff859..ee3903aafc8 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -893,7 +893,8 @@ map::const_iterator PG::find_best_info( for (map::const_iterator i = infos.begin(); i != infos.end(); ++i) { - if (max_last_epoch_started_found < i->second.history.last_epoch_started) { + if (!cct->_conf->osd_find_best_info_ignore_history_les && + max_last_epoch_started_found < i->second.history.last_epoch_started) { min_last_update_acceptable = eversion_t::max(); max_last_epoch_started_found = i->second.history.last_epoch_started; } From 2da958464df4ea59eb5e4784f1dc670f8ac3fdfc Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 18 Mar 2015 11:19:09 -0700 Subject: [PATCH 2/4] PG: make sure to update history.last_epoch_started with first write Otherwise, we might remember the write, but not that the activation interval was history.last_epoch_started. 
Signed-off-by: Samuel Just --- src/osd/PG.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index ee3903aafc8..cf72a86c41c 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2911,6 +2911,15 @@ void PG::append_log( { if (transaction_applied) update_snap_map(logv, t); + + /* The primary has sent an info updating the history, but it may not + * have arrived yet. We want to make sure that we cannot remember this + * write without remembering that it happened in an interval which went + * active in epoch history.last_epoch_started. + */ + if (info.last_epoch_started != info.history.last_epoch_started) { + info.history.last_epoch_started = info.last_epoch_started; + } dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl; map keys; From 2956ae278daa8744e52e7c69fe5d5416267b84a4 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 18 Mar 2015 12:02:04 -0700 Subject: [PATCH 3/4] doc: add last_epoch_started.rst Signed-off-by: Samuel Just --- doc/dev/osd_internals/last_epoch_started.rst | 39 ++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 doc/dev/osd_internals/last_epoch_started.rst diff --git a/doc/dev/osd_internals/last_epoch_started.rst b/doc/dev/osd_internals/last_epoch_started.rst new file mode 100644 index 00000000000..fcb930f48b6 --- /dev/null +++ b/doc/dev/osd_internals/last_epoch_started.rst @@ -0,0 +1,39 @@ +====================== +last_epoch_started +====================== + +info.last_epoch_started records an activation epoch e for interval i +such that all writes committed in i or earlier are reflected in the +local info/log and no writes after i are reflected in the local +info/log. Since no committed write is ever divergent, even if we +get an authoritative log/info with an older info.last_epoch_started, +we can leave our info.last_epoch_started alone since no writes could +have committed in any intervening interval (See PG::proc_master_log). 
+ +info.history.last_epoch_started records a lower bound on the most +recent interval in which the pg as a whole went active and accepted +writes. On a particular osd, it is also an upper bound on the +activation epoch of intervals in which writes in the local pg log +occurred (we update it before accepting writes). Because all +committed writes are committed by all acting set osds, any +non-divergent writes ensure that history.last_epoch_started was +recorded by all acting set members in the interval. Once peering has +queried one osd from each interval back to some seen +history.last_epoch_started, it follows that no interval after the max +history.last_epoch_started can have reported writes as committed +(since we record it before recording client writes in an interval). +Thus, the minimum last_update across all infos with +info.last_epoch_started >= MAX(history.last_epoch_started) must be an +upper bound on writes reported as committed to the client. + +We update info.last_epoch_started with the initial activation message, +but we only update history.last_epoch_started after the new +info.last_epoch_started is persisted (possibly along with the first +write). This ensures that we do not require an osd with the most +recent info.last_epoch_started until all acting set osds have recorded +it. In find_best_info, we do include info.last_epoch_started values +when calculating the max_last_epoch_started_found because we want to +avoid designating a log entry divergent which in a prior interval +would have been non-divergent. In activate(), we use the peer's +last_epoch_started value as a bound on how far back divergent log +entries can be found. 
From 0712d8d90b4eb455ae56cce5eafdce3e50de39e0 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 18 Mar 2015 12:11:07 -0700 Subject: [PATCH 4/4] PG: ensure that info.last_epoch_started only increases See doc/dev/osd_internals/last_epoch_started.rst Fixes: #11110 Signed-off-by: Samuel Just --- src/osd/PG.cc | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index cf72a86c41c..e99fb790af8 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -280,8 +280,12 @@ void PG::proc_master_log( peer_info[from] = oinfo; dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl; might_have_unfound.insert(from); - info.last_epoch_started = oinfo.last_epoch_started; + + // See doc/dev/osd_internals/last_epoch_started + if (oinfo.last_epoch_started > info.last_epoch_started) + info.last_epoch_started = oinfo.last_epoch_started; info.history.merge(oinfo.history); + assert(info.last_epoch_started >= info.history.last_epoch_started); peer_missing[from].swap(omissing); } @@ -1480,11 +1484,17 @@ void PG::activate(ObjectStore::Transaction& t, if (is_primary()) { // only update primary last_epoch_started if we will go active - if (acting.size() >= pool.info.min_size) + if (acting.size() >= pool.info.min_size) { + assert(cct->_conf->osd_find_best_info_ignore_history_les || + info.last_epoch_started <= activation_epoch); info.last_epoch_started = activation_epoch; + } } else if (is_acting(pg_whoami)) { - // update last_epoch_started on acting replica to whatever the primary sent - info.last_epoch_started = activation_epoch; + /* update last_epoch_started on acting replica to whatever the primary sent + * unless it's smaller (could happen if we are going peered rather than + * active, see doc/dev/osd_internals/last_epoch_started.rst) */ + if (info.last_epoch_started < activation_epoch) + info.last_epoch_started = activation_epoch; } const pg_missing_t &missing = pg_log.get_missing();