Merge pull request #23663 from xiexingguo/wip-incompat-async-fixes

osd: some recovery improvements and cleanups


Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Xie Xingguo 2018-09-01 14:27:27 +08:00 committed by GitHub
commit 0857124d23
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 98 additions and 59 deletions

View File

@ -26,6 +26,8 @@ function run() {
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
# so we will not force auth_log_shard to be acting_primary
CEPH_ARGS+="--osd_force_auth_primary_missing_objects=1000000 "
export margin=10
export objects=200
export poolname=test

View File

@ -3217,6 +3217,10 @@ std::vector<Option> get_global_options() {
.set_default(100)
.set_description(""),
Option("osd_force_auth_primary_missing_objects", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(100)
.set_description("Approximate missing objects above which to force auth_log_shard to be primary temporarily"),
Option("osd_async_recovery_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(100)
.set_description("Number of entries difference above which to use asynchronous recovery when appropriate"),

View File

@ -1208,6 +1208,22 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
continue;
}
if (!p->second.has_missing() && best->second.has_missing()) {
dout(10) << __func__ << " prefer osd." << p->first
<< " because it is complete while best has missing"
<< dendl;
best = p;
continue;
} else if (p->second.has_missing() && !best->second.has_missing()) {
dout(10) << __func__ << " skipping osd." << p->first
<< " because it has missing while best is complete"
<< dendl;
continue;
} else {
// both are complete or have missing
// fall through
}
// prefer current primary (usually the caller), all things being equal
if (p->first == pg_whoami) {
dout(10) << "calc_acting prefer osd." << p->first
@ -1296,6 +1312,7 @@ void PG::calc_ec_acting(
*/
void PG::calc_replicated_acting(
map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
uint64_t force_auth_primary_missing_objects,
unsigned size,
const vector<int> &acting,
const vector<int> &up,
@ -1305,6 +1322,7 @@ void PG::calc_replicated_acting(
vector<int> *want,
set<pg_shard_t> *backfill,
set<pg_shard_t> *acting_backfill,
const OSDMapRef osdmap,
ostream &ss)
{
pg_shard_t auth_log_shard_id = auth_log_shard->first;
@ -1314,12 +1332,37 @@ void PG::calc_replicated_acting(
<< (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
// select primary
map<pg_shard_t,pg_info_t>::const_iterator primary = all_info.find(up_primary);
auto primary = all_info.find(up_primary);
if (up.size() &&
!primary->second.is_incomplete() &&
primary->second.last_update >=
auth_log_shard->second.log_tail) {
ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
auto approx_missing_objects =
primary->second.stats.stats.sum.num_objects_missing;
auto auth_version = auth_log_shard->second.last_update.version;
auto primary_version = primary->second.last_update.version;
if (auth_version > primary_version) {
approx_missing_objects += auth_version - primary_version;
} else {
approx_missing_objects += primary_version - auth_version;
}
if ((uint64_t)approx_missing_objects >
force_auth_primary_missing_objects) {
primary = auth_log_shard;
ss << "up_primary: " << up_primary << ") has approximate "
<< approx_missing_objects
<< "(>" << force_auth_primary_missing_objects <<") "
<< "missing objects, osd." << auth_log_shard_id
<< " selected as primary instead"
<< std::endl;
} else {
ss << "up_primary: " << up_primary << ") selected as primary"
<< std::endl;
}
} else {
ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
}
} else {
ceph_assert(!auth_log_shard->second.is_incomplete());
ss << "up[0] needs backfill, osd." << auth_log_shard_id
@ -1331,52 +1374,47 @@ void PG::calc_replicated_acting(
<< " with " << primary->second << std::endl;
want->push_back(primary->first.osd);
acting_backfill->insert(primary->first);
unsigned usable = 1;
/* We include auth_log_shard->second.log_tail because in GetLog,
* we will request logs back to the min last_update over our
* acting_backfill set, which will result in our log being extended
* as far backwards as necessary to pick up any peers which can
* be log recovered by auth_log_shard's log */
eversion_t oldest_auth_log_entry =
std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
// select replicas that have log contiguity with primary.
// prefer up, then acting, then any peer_info osds
eversion_t oldest_auth_log_entry =
std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
for (vector<int>::const_iterator i = up.begin();
i != up.end();
++i) {
pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
for (auto i : up) {
pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
if (up_cand == primary->first)
continue;
const pg_info_t &cur_info = all_info.find(up_cand)->second;
if (cur_info.is_incomplete() ||
cur_info.last_update < oldest_auth_log_entry) {
/* We include auth_log_shard->second.log_tail because in GetLog,
* we will request logs back to the min last_update over our
* acting_backfill set, which will result in our log being extended
* as far backwards as necessary to pick up any peers which can
* be log recovered by auth_log_shard's log */
ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
backfill->insert(up_cand);
acting_backfill->insert(up_cand);
} else {
want->push_back(*i);
want->push_back(i);
acting_backfill->insert(up_cand);
usable++;
ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
}
}
if (usable >= size) {
if (want->size() >= size) {
return;
}
std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
candidate_by_last_update.reserve(acting.size());
// This no longer has backfill OSDs, but they are covered above.
for (vector<int>::const_iterator i = acting.begin();
i != acting.end();
++i) {
pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
for (auto i : acting) {
pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
// skip up osds we already considered above
if (acting_cand == primary->first)
continue;
vector<int>::const_iterator up_it = find(up.begin(), up.end(), *i);
vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
if (up_it != up.end())
continue;
@ -1386,27 +1424,25 @@ void PG::calc_replicated_acting(
ss << " shard " << acting_cand << " (acting) REJECTED "
<< cur_info << std::endl;
} else {
candidate_by_last_update.push_back(make_pair(cur_info.last_update, *i));
candidate_by_last_update.push_back(make_pair(cur_info.last_update, i));
}
}
auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
const std::pair<eversion_t, int> &rhs) {
return lhs.first > rhs.first;
};
// sort by last_update, in descending order.
std::sort(candidate_by_last_update.begin(), candidate_by_last_update.end(),
[](const std::pair<eversion_t, int> &lhs,
const std::pair<eversion_t, int> &rhs) {
return lhs.first > rhs.first;
}
);
std::sort(candidate_by_last_update.begin(),
candidate_by_last_update.end(), sort_by_eversion);
for (auto &p: candidate_by_last_update) {
ceph_assert(usable < size);
ceph_assert(want->size() < size);
want->push_back(p.second);
pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
acting_backfill->insert(s);
ss << " shard " << s << " (acting) accepted "
<< all_info.find(s)->second << std::endl;
usable++;
if (usable >= size) {
if (want->size() >= size) {
return;
}
}
@ -1416,27 +1452,26 @@ void PG::calc_replicated_acting(
}
candidate_by_last_update.clear();
candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
i != all_info.end();
++i) {
// continue to search stray to find more suitable peers
for (auto &i : all_info) {
// skip up osds we already considered above
if (i->first == primary->first)
if (i.first == primary->first)
continue;
vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
if (up_it != up.end())
continue;
vector<int>::const_iterator acting_it = find(
acting.begin(), acting.end(), i->first.osd);
acting.begin(), acting.end(), i.first.osd);
if (acting_it != acting.end())
continue;
if (i->second.is_incomplete() ||
i->second.last_update < oldest_auth_log_entry) {
ss << " shard " << i->first << " (stray) REJECTED "
<< i->second << std::endl;
if (i.second.is_incomplete() ||
i.second.last_update < oldest_auth_log_entry) {
ss << " shard " << i.first << " (stray) REJECTED " << i.second
<< std::endl;
} else {
candidate_by_last_update.push_back(
make_pair(i->second.last_update, i->first.osd));
make_pair(i.second.last_update, i.first.osd));
}
}
@ -1446,22 +1481,17 @@ void PG::calc_replicated_acting(
}
// sort by last_update, in descending order.
std::sort(candidate_by_last_update.begin(), candidate_by_last_update.end(),
[](const std::pair<eversion_t, int> &lhs,
const std::pair<eversion_t, int> &rhs) {
return lhs.first > rhs.first;
}
);
std::sort(candidate_by_last_update.begin(),
candidate_by_last_update.end(), sort_by_eversion);
for (auto &p: candidate_by_last_update) {
ceph_assert(usable < size);
ceph_assert(want->size() < size);
want->push_back(p.second);
pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
acting_backfill->insert(s);
ss << " shard " << s << " (stray) accepted "
<< all_info.find(s)->second << std::endl;
usable++;
if (usable >= size) {
if (want->size() >= size) {
return;
}
}
@ -1667,6 +1697,8 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
if (!pool.info.is_erasure())
calc_replicated_acting(
auth_log_shard,
cct->_conf.get_val<uint64_t>(
"osd_force_auth_primary_missing_objects"),
get_osdmap()->get_pg_size(info.pgid.pgid),
acting,
up,
@ -1676,6 +1708,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
&want,
&want_backfill,
&want_acting_backfill,
get_osdmap(),
ss);
else
calc_ec_acting(
@ -1874,6 +1907,7 @@ void PG::activate(ObjectStore::Transaction& t,
pg_log.reset_recovery_pointers();
} else {
dout(10) << "activate - not complete, " << missing << dendl;
info.stats.stats.sum.num_objects_missing = missing.num_missing();
pg_log.activate_not_complete(info);
}

View File

@ -1441,6 +1441,7 @@ protected:
ostream &ss);
static void calc_replicated_acting(
map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
uint64_t force_auth_primary_missing_objects,
unsigned size,
const vector<int> &acting,
const vector<int> &up,
@ -1450,6 +1451,7 @@ protected:
vector<int> *want,
set<pg_shard_t> *backfill,
set<pg_shard_t> *acting_backfill,
const OSDMapRef osdmap,
ostream &ss);
void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
const pg_info_t &auth_info,

View File

@ -753,6 +753,7 @@ public:
void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
if (missing.is_missing(oid, v)) {
missing.got(oid, v);
info.stats.stats.sum.num_objects_missing = missing.num_missing();
// raise last_complete?
if (missing.get_items().empty()) {

View File

@ -2411,16 +2411,12 @@ struct pg_info_t {
bool is_empty() const { return last_update.version == 0; }
bool dne() const { return history.epoch_created == 0; }
bool has_missing() const { return last_complete != last_update; }
bool is_incomplete() const { return !last_backfill.is_max(); }
void encode(bufferlist& bl) const;
void decode(bufferlist::const_iterator& p);
void dump(Formatter *f) const;
bool overlaps_with(const pg_info_t &oinfo) const {
return last_update > oinfo.log_tail ?
oinfo.last_update >= log_tail :
last_update >= oinfo.log_tail;
}
static void generate_test_instances(list<pg_info_t*>& o);
};
WRITE_CLASS_ENCODER(pg_info_t)