mirror of
https://github.com/ceph/ceph
synced 2025-02-20 17:37:29 +00:00
Merge pull request #4105 from athanatos/wip-11110
Wip 11110 Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
commit
91a5816702
39
doc/dev/osd_internals/last_epoch_started.rst
Normal file
39
doc/dev/osd_internals/last_epoch_started.rst
Normal file
@ -0,0 +1,39 @@
|
||||
======================
|
||||
last_epoch_started
|
||||
======================
|
||||
|
||||
info.last_epoch_started records an activation epoch e for interval i
|
||||
such that all writes committed in i or earlier are reflected in the
|
||||
local info/log and no writes after i are reflected in the local
|
||||
info/log. Since no committed write is ever divergent, even if we
|
||||
get an authoritative log/info with an older info.last_epoch_started,
|
||||
we can leave our info.last_epoch_started alone since no writes could
|
||||
have committed in any intervening interval (See PG::proc_master_log).
|
||||
|
||||
info.history.last_epoch_started records a lower bound on the most
|
||||
recent interval in which the pg as a whole went active and accepted
|
||||
writes. On a particular osd, it is also an upper bound on the
|
||||
activation epoch of intervals in which writes in the local pg log
|
||||
occurred (we update it before accepting writes). Because all
|
||||
committed writes are committed by all acting set osds, any
|
||||
non-divergent writes ensure that history.last_epoch_started was
|
||||
recorded by all acting set members in the interval. Once peering has
|
||||
queried one osd from each interval back to some seen
|
||||
history.last_epoch_started, it follows that no interval after the max
|
||||
history.last_epoch_started can have reported writes as committed
|
||||
(since we record it before recording client writes in an interval).
|
||||
Thus, the minimum last_update across all infos with
|
||||
info.last_epoch_started >= MAX(history.last_epoch_started) must be an
|
||||
upper bound on writes reported as committed to the client.
|
||||
|
||||
We update info.last_epoch_started with the initial activation message,
|
||||
but we only update history.last_epoch_started after the new
|
||||
info.last_epoch_started is persisted (possibly along with the first
|
||||
write). This ensures that we do not require an osd with the most
|
||||
recent info.last_epoch_started until all acting set osds have recorded
|
||||
it. In find_best_info, we do include info.last_epoch_started values
|
||||
when calculating the max_last_epoch_started_found because we want to
|
||||
avoid designating a log entry divergent which in a prior interval
|
||||
would have been non-divergent. In activate(), we use the peer's
|
||||
last_epoch_started value as a bound on how far back divergent log
|
||||
entries can be found.
|
@ -468,6 +468,9 @@ OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1)
|
||||
OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1)
|
||||
OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0)
|
||||
|
||||
// osd ignore history.last_epoch_started in find_best_info
|
||||
OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL, false)
|
||||
|
||||
// decay atime and hist histograms after how many objects go by
|
||||
OPTION(osd_agent_hist_halflife, OPT_INT, 1000)
|
||||
|
||||
|
@ -280,8 +280,12 @@ void PG::proc_master_log(
|
||||
peer_info[from] = oinfo;
|
||||
dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
|
||||
might_have_unfound.insert(from);
|
||||
info.last_epoch_started = oinfo.last_epoch_started;
|
||||
|
||||
// See doc/dev/osd_internals/last_epoch_started
|
||||
if (oinfo.last_epoch_started > info.last_epoch_started)
|
||||
info.last_epoch_started = oinfo.last_epoch_started;
|
||||
info.history.merge(oinfo.history);
|
||||
assert(info.last_epoch_started >= info.history.last_epoch_started);
|
||||
|
||||
peer_missing[from].swap(omissing);
|
||||
}
|
||||
@ -893,7 +897,8 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
|
||||
for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
|
||||
i != infos.end();
|
||||
++i) {
|
||||
if (max_last_epoch_started_found < i->second.history.last_epoch_started) {
|
||||
if (!cct->_conf->osd_find_best_info_ignore_history_les &&
|
||||
max_last_epoch_started_found < i->second.history.last_epoch_started) {
|
||||
min_last_update_acceptable = eversion_t::max();
|
||||
max_last_epoch_started_found = i->second.history.last_epoch_started;
|
||||
}
|
||||
@ -1479,11 +1484,17 @@ void PG::activate(ObjectStore::Transaction& t,
|
||||
|
||||
if (is_primary()) {
|
||||
// only update primary last_epoch_started if we will go active
|
||||
if (acting.size() >= pool.info.min_size)
|
||||
if (acting.size() >= pool.info.min_size) {
|
||||
assert(cct->_conf->osd_find_best_info_ignore_history_les ||
|
||||
info.last_epoch_started <= activation_epoch);
|
||||
info.last_epoch_started = activation_epoch;
|
||||
}
|
||||
} else if (is_acting(pg_whoami)) {
|
||||
// update last_epoch_started on acting replica to whatever the primary sent
|
||||
info.last_epoch_started = activation_epoch;
|
||||
/* update last_epoch_started on acting replica to whatever the primary sent
|
||||
* unless it's smaller (could happen if we are going peered rather than
|
||||
* active, see doc/dev/osd_internals/last_epoch_started.rst) */
|
||||
if (info.last_epoch_started < activation_epoch)
|
||||
info.last_epoch_started = activation_epoch;
|
||||
}
|
||||
|
||||
const pg_missing_t &missing = pg_log.get_missing();
|
||||
@ -2910,6 +2921,15 @@ void PG::append_log(
|
||||
{
|
||||
if (transaction_applied)
|
||||
update_snap_map(logv, t);
|
||||
|
||||
/* The primary has sent an info updating the history, but it may not
|
||||
* have arrived yet. We want to make sure that we cannot remember this
|
||||
* write without remembering that it happened in an interval which went
|
||||
* active in epoch history.last_epoch_started.
|
||||
*/
|
||||
if (info.last_epoch_started != info.history.last_epoch_started) {
|
||||
info.history.last_epoch_started = info.last_epoch_started;
|
||||
}
|
||||
dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
|
||||
|
||||
map<string,bufferlist> keys;
|
||||
|
Loading…
Reference in New Issue
Block a user