Merge pull request #4105 from athanatos/wip-11110

Wip 11110

Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Samuel Just 2015-03-19 09:41:58 -07:00
commit 91a5816702
3 changed files with 67 additions and 5 deletions

View File

@ -0,0 +1,39 @@
======================
last_epoch_started
======================
info.last_epoch_started records an activation epoch e for interval i
such that all writes committed in i or earlier are reflected in the
local info/log and no writes after i are reflected in the local
info/log. Since no committed write is ever divergent, even if we
get an authoritative log/info with an older info.last_epoch_started,
we can leave our info.last_epoch_started alone since no writes could
have committed in any intervening interval (see PG::proc_master_log).
info.history.last_epoch_started records a lower bound on the most
recent interval in which the pg as a whole went active and accepted
writes. On a particular osd, it is also an upper bound on the
activation epoch of intervals in which writes in the local pg log
occurred (we update it before accepting writes). Because all
committed writes are committed by all acting set osds, any
non-divergent writes ensure that history.last_epoch_started was
recorded by all acting set members in the interval. Once peering has
queried one osd from each interval back to some seen
history.last_epoch_started, it follows that no interval after the max
history.last_epoch_started can have reported writes as committed
(since we record it before recording client writes in an interval).
Thus, the minimum last_update across all infos with
info.last_epoch_started >= MAX(history.last_epoch_started) must be an
upper bound on writes reported as committed to the client.
We update info.last_epoch_started with the initial activation message,
but we only update history.last_epoch_started after the new
info.last_epoch_started is persisted (possibly along with the first
write). This ensures that we do not require an osd with the most
recent info.last_epoch_started until all acting set osds have recorded
it. In find_best_info, we do include info.last_epoch_started values
when calculating the max_last_epoch_started_found because we want to
avoid designating a log entry divergent which in a prior interval
would have been non-divergent. In activate(), we use the peer's
last_epoch_started value as a bound on how far back divergent log
entries can be found.

View File

@ -468,6 +468,9 @@ OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1)
OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1)
OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0)
// if true, find_best_info ignores history.last_epoch_started
OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL, false)
// decay atime and hist histograms after how many objects go by
OPTION(osd_agent_hist_halflife, OPT_INT, 1000)

View File

@ -280,8 +280,12 @@ void PG::proc_master_log(
peer_info[from] = oinfo;
dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
might_have_unfound.insert(from);
info.last_epoch_started = oinfo.last_epoch_started;
// See doc/dev/osd_internals/last_epoch_started
if (oinfo.last_epoch_started > info.last_epoch_started)
info.last_epoch_started = oinfo.last_epoch_started;
info.history.merge(oinfo.history);
assert(info.last_epoch_started >= info.history.last_epoch_started);
peer_missing[from].swap(omissing);
}
@ -893,7 +897,8 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
i != infos.end();
++i) {
if (max_last_epoch_started_found < i->second.history.last_epoch_started) {
if (!cct->_conf->osd_find_best_info_ignore_history_les &&
max_last_epoch_started_found < i->second.history.last_epoch_started) {
min_last_update_acceptable = eversion_t::max();
max_last_epoch_started_found = i->second.history.last_epoch_started;
}
@ -1479,11 +1484,17 @@ void PG::activate(ObjectStore::Transaction& t,
if (is_primary()) {
// only update primary last_epoch_started if we will go active
if (acting.size() >= pool.info.min_size)
if (acting.size() >= pool.info.min_size) {
assert(cct->_conf->osd_find_best_info_ignore_history_les ||
info.last_epoch_started <= activation_epoch);
info.last_epoch_started = activation_epoch;
}
} else if (is_acting(pg_whoami)) {
// update last_epoch_started on acting replica to whatever the primary sent
info.last_epoch_started = activation_epoch;
/* update last_epoch_started on acting replica to whatever the primary sent
* unless it's smaller (could happen if we are going peered rather than
* active, see doc/dev/osd_internals/last_epoch_started.rst) */
if (info.last_epoch_started < activation_epoch)
info.last_epoch_started = activation_epoch;
}
const pg_missing_t &missing = pg_log.get_missing();
@ -2910,6 +2921,15 @@ void PG::append_log(
{
if (transaction_applied)
update_snap_map(logv, t);
/* The primary has sent an info updating the history, but it may not
* have arrived yet. We want to make sure that we cannot remember this
* write without remembering that it happened in an interval which went
* active in epoch history.last_epoch_started.
*/
if (info.last_epoch_started != info.history.last_epoch_started) {
info.history.last_epoch_started = info.last_epoch_started;
}
dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
map<string,bufferlist> keys;