mirror of
https://github.com/ceph/ceph
synced 2025-03-11 02:39:05 +00:00
osd: support osd_repair_during_recovery
Setting osd_repair_during_recovery=true allows explicitly requested repairs to be scheduled on OSDs with active recovery. Fixes: http://tracker.ceph.com/issues/40620 Signed-off-by: Jeegn Chen <jeegnchen@tencent.com>
This commit is contained in:
parent
b7010730f2
commit
80f4e1f677
@ -107,6 +107,86 @@ function TEST_corrupt_and_repair_replicated() {
|
||||
teardown $dir || return 1
|
||||
}
|
||||
|
||||
#
|
||||
# Allow repair to be scheduled while recovery is still in progress on the same OSD
|
||||
#
|
||||
function TEST_allow_repair_during_recovery() {
    local dir=$1
    local poolname=rbd
    local osd_id
    # Flags shared by both OSDs: ordinary scrubs are disabled during
    # recovery, requested repairs are allowed, and the debug option is
    # set so recovery is pretended to be active.
    local -a osd_flags=(
        --osd_scrub_during_recovery=false
        --osd_repair_during_recovery=true
        --osd_debug_pretend_recovery_active=true
    )

    # Bring up a two-OSD cluster with a size-2 pool default.
    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=2 || return 1
    run_mgr $dir x || return 1
    for osd_id in 0 1; do
        run_osd $dir $osd_id "${osd_flags[@]}" || return 1
    done
    create_rbd_pool || return 1
    wait_for_clean || return 1

    # Write the test object, then corrupt and repair it on a non-primary OSD.
    add_something $dir $poolname || return 1
    corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1

    teardown $dir || return 1
}
|
||||
|
||||
#
|
||||
# Skip non-repair scrub correctly during recovery
|
||||
#
|
||||
function TEST_skip_non_repair_during_recovery() {
    local dir=$1
    local poolname=rbd
    local osd_id
    # Flags shared by both OSDs: ordinary scrubs are disabled during
    # recovery, requested repairs are allowed, and the debug option is
    # set so recovery is pretended to be active.
    local -a osd_flags=(
        --osd_scrub_during_recovery=false
        --osd_repair_during_recovery=true
        --osd_debug_pretend_recovery_active=true
    )

    # Bring up a two-OSD cluster with a size-2 pool default.
    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=2 || return 1
    run_mgr $dir x || return 1
    for osd_id in 0 1; do
        run_osd $dir $osd_id "${osd_flags[@]}" || return 1
    done
    create_rbd_pool || return 1
    wait_for_clean || return 1

    # Write the test object, then check that a plain (non-repair) scrub
    # request is not scheduled.
    add_something $dir $poolname || return 1
    scrub_and_not_schedule $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1

    teardown $dir || return 1
}
|
||||
|
||||
# Request a non-repair scrub of the PG holding SOMETHING and verify that it
# is not scheduled, then verify the object is still readable and intact.
#
# @param dir       test directory (holds ORIGINAL and receives COPY)
# @param poolname  pool containing the SOMETHING object
# @param osd       id of the OSD whose object store is inspected
# @return 0 if the scrub stamp did not advance and the object is intact,
#         1 otherwise
function scrub_and_not_schedule() {
    local dir=$1
    local poolname=$2
    local osd=$3
    local i

    #
    # 1) start a non-repair scrub
    #
    local pg=$(get_pg $poolname SOMETHING)
    local last_scrub=$(get_last_scrub_stamp $pg)
    # If the request itself fails, the stamp can never advance and the
    # check in step 2 would pass vacuously, so fail early instead.
    ceph pg scrub $pg || return 1

    #
    # 2) Assure the scrub is not scheduled
    #
    # Poll the scrub stamp once per second for three seconds; any advance
    # means the scrub ran although it should have been skipped.
    for ((i=0; i < 3; i++)); do
        if test "$(get_last_scrub_stamp $pg)" '>' "$last_scrub" ; then
            return 1
        fi
        sleep 1
    done

    #
    # 3) Access to the file must be OK
    #
    objectstore_tool $dir $osd SOMETHING list-attrs || return 1
    rados --pool $poolname get SOMETHING $dir/COPY || return 1
    diff $dir/ORIGINAL $dir/COPY || return 1
}
|
||||
|
||||
function corrupt_and_repair_two() {
|
||||
local dir=$1
|
||||
local poolname=$2
|
||||
|
@ -699,6 +699,7 @@ OPTION(osd_max_push_cost, OPT_U64) // max size of push message
|
||||
OPTION(osd_max_push_objects, OPT_U64) // max objects in single push op
|
||||
OPTION(osd_max_scrubs, OPT_INT)
|
||||
OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD
|
||||
OPTION(osd_repair_during_recovery, OPT_BOOL) // Allow new requested repairs to start while recovery is active on the OSD
|
||||
OPTION(osd_scrub_begin_hour, OPT_INT)
|
||||
OPTION(osd_scrub_end_hour, OPT_INT)
|
||||
OPTION(osd_scrub_begin_week_day, OPT_INT)
|
||||
@ -767,6 +768,7 @@ OPTION(osd_debug_random_push_read_error, OPT_DOUBLE)
|
||||
OPTION(osd_debug_verify_cached_snaps, OPT_BOOL)
|
||||
OPTION(osd_debug_deep_scrub_sleep, OPT_FLOAT)
|
||||
OPTION(osd_debug_no_acting_change, OPT_BOOL)
|
||||
OPTION(osd_debug_pretend_recovery_active, OPT_BOOL)
|
||||
OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking
|
||||
OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops
|
||||
OPTION(osd_op_history_size, OPT_U32) // Max number of completed ops to track
|
||||
|
@ -3406,6 +3406,10 @@ std::vector<Option> get_global_options() {
|
||||
.set_default(false)
|
||||
.set_description("Allow scrubbing when PGs on the OSD are undergoing recovery"),
|
||||
|
||||
Option("osd_repair_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(false)
|
||||
.set_description("Allow requested repairing when PGs on the OSD are undergoing recovery"),
|
||||
|
||||
Option("osd_scrub_begin_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
|
||||
.set_default(0)
|
||||
.set_description("Restrict scrubbing to this hour of the day or later")
|
||||
@ -3723,6 +3727,10 @@ std::vector<Option> get_global_options() {
|
||||
Option("osd_debug_no_purge_strays", Option::TYPE_BOOL, Option::LEVEL_DEV)
|
||||
.set_default(false),
|
||||
|
||||
Option("osd_debug_pretend_recovery_active", Option::TYPE_BOOL, Option::LEVEL_DEV)
|
||||
.set_default(false)
|
||||
.set_description(""),
|
||||
|
||||
Option("osd_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(true)
|
||||
.set_description(""),
|
||||
|
@ -7274,12 +7274,19 @@ void OSD::sched_scrub()
|
||||
if (!service.can_inc_scrubs_pending()) {
|
||||
return;
|
||||
}
|
||||
if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
|
||||
dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
|
||||
return;
|
||||
bool allow_requested_repair_only = false;
|
||||
if (service.is_recovery_active()) {
|
||||
if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
|
||||
dout(10) << __func__
|
||||
<< " will only schedule explicitly requested repair due to active recovery"
|
||||
<< dendl;
|
||||
allow_requested_repair_only = true;
|
||||
} else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
|
||||
dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
utime_t now = ceph_clock_now();
|
||||
bool time_permit = scrub_time_permit(now);
|
||||
bool load_is_low = scrub_load_below_threshold();
|
||||
@ -7312,6 +7319,14 @@ void OSD::sched_scrub()
|
||||
dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
|
||||
continue;
|
||||
}
|
||||
// Skip other kinds of scrubing if only explicitly requested repairing is allowed
|
||||
if (allow_requested_repair_only && !pg->scrubber.must_repair) {
|
||||
pg->unlock();
|
||||
dout(10) << __func__ << " skip " << scrub.pgid
|
||||
<< " because repairing is not explicitly requested on it"
|
||||
<< dendl;
|
||||
continue;
|
||||
}
|
||||
// If it is reserving, let it resolve before going to the next scrub job
|
||||
if (pg->scrubber.reserved) {
|
||||
pg->unlock();
|
||||
@ -9425,6 +9440,9 @@ void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
|
||||
|
||||
bool OSDService::is_recovery_active()
|
||||
{
|
||||
if (cct->_conf->osd_debug_pretend_recovery_active) {
|
||||
return true;
|
||||
}
|
||||
return local_reserver.has_reservation() || remote_reserver.has_reservation();
|
||||
}
|
||||
|
||||
|
@ -1421,7 +1421,10 @@ bool PG::sched_scrub()
|
||||
scrubber.need_auto = false;
|
||||
|
||||
ceph_assert(scrubber.reserved_peers.empty());
|
||||
if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
|
||||
bool allow_scrubing = cct->_conf->osd_scrub_during_recovery ||
|
||||
(cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
|
||||
!osd->is_recovery_active();
|
||||
if (allow_scrubing &&
|
||||
osd->inc_scrubs_pending()) {
|
||||
dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
|
||||
scrubber.reserved = true;
|
||||
|
Loading…
Reference in New Issue
Block a user