qa: add new test case for pulling error

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
This commit is contained in:
xie xingguo 2019-04-02 16:17:52 +08:00
parent 11f072fee1
commit 6a8aedc107
4 changed files with 38 additions and 8 deletions

View File

@ -122,15 +122,27 @@ function rados_get_data() {
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "1" || return 1
local object_osds=($(get_osds $poolname $objname))
local primary=${object_osds[0]}
local bad_peer=${object_osds[1]}
inject_$inject rep data $poolname $objname $dir 0 || return 1
inject_$inject rep data $poolname $objname $dir 1 || return 1
# Force primary to pull from the bad peer, so we can repair it too!
set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
rados_get $dir $poolname $objname || return 1
# Wait until automatic repair of bad peer is done
wait_for_clean || return 1
inject_$inject rep data $poolname $objname $dir 0 || return 1
inject_$inject rep data $poolname $objname $dir 2 || return 1
rados_get $dir $poolname $objname || return 1
COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
test "$COUNT" = "2" || return 1
test "$COUNT" = "3" || return 1
flush_pg_stats
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "2" || return 1
test "$COUNT" = "4" || return 1
inject_$inject rep data $poolname $objname $dir 0 || return 1
inject_$inject rep data $poolname $objname $dir 1 || return 1
@ -139,10 +151,10 @@ function rados_get_data() {
# After the hang, another repair cannot happen, so the counts stay the same
COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
test "$COUNT" = "2" || return 1
test "$COUNT" = "3" || return 1
flush_pg_stats
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "2" || return 1
test "$COUNT" = "4" || return 1
}
function TEST_rados_get_with_eio() {

View File

@ -643,6 +643,7 @@ OPTION(osd_read_ec_check_for_errors, OPT_BOOL) // return error if any ec shard h
// Only use clone_overlap for recovery if there are fewer than
// osd_recover_clone_overlap_limit entries in the overlap set
OPTION(osd_recover_clone_overlap_limit, OPT_INT)
OPTION(osd_debug_feed_pullee, OPT_INT)
OPTION(osd_backfill_scan_min, OPT_INT)
OPTION(osd_backfill_scan_max, OPT_INT)

View File

@ -3123,6 +3123,11 @@ std::vector<Option> get_global_options() {
.set_default(10)
.set_description(""),
Option("osd_debug_feed_pullee", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(-1)
.set_description("Feed a pullee, and force primary to pull "
"a currently missing object from it"),
Option("osd_backfill_scan_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(64)
.set_description(""),

View File

@ -1299,10 +1299,22 @@ void ReplicatedBackend::prepare_pull(
ceph_assert(!q->second.empty());
// pick a pullee
auto p = q->second.begin();
std::advance(p,
util::generate_random_number<int>(0,
q->second.size() - 1));
auto p = q->second.end();
if (cct->_conf->osd_debug_feed_pullee >= 0) {
for (auto it = q->second.begin(); it != q->second.end(); it++) {
if (it->osd == cct->_conf->osd_debug_feed_pullee) {
p = it;
break;
}
}
}
if (p == q->second.end()) {
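// no debug pullee configured, or the configured osd is not among the
// candidates in q->second (e.g. the user fed a wrong pullee): pick randomly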
p = q->second.begin();
std::advance(p,
util::generate_random_number<int>(0,
q->second.size() - 1));
}
ceph_assert(get_osdmap()->is_up(p->osd));
pg_shard_t fromshard = *p;