mirror of
https://github.com/ceph/ceph
synced 2024-12-12 14:39:05 +00:00
33d2a2c93b
The following tests in the test files mentioned below use the "osd_scrub_sleep" option to introduce delays during scrubbing to help determine scrubbing states, validate reservations during scrubbing, etc. This works when using the "wpq" scheduler. But when the "mclock_scheduler" is enabled, the "osd_scrub_sleep" is disabled and overridden to 0. This is done to delegate the scheduling of the background scrubs to the "mclock_scheduler" based on the set QoS parameters. Due to this, the checks to verify the scrub states, reservations, etc. fail since the window to check them is very short because scrubs complete very quickly. This affects a small subset of scrub tests mentioned below: 1. osd-scrub-dump.sh -> TEST_recover_unexpected() 2. osd-scrub-repair.sh -> TEST_auto_repair_bluestore_tag() 3. osd-scrub-test.sh -> TEST_scrub_abort(), TEST_deep_scrub_abort() Only for the above tests, until there's a reliable way to query scrub states with "--osd-scrub-sleep" set to 0, the "osd_op_queue" config option is set to "wpq". Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
463 lines
13 KiB
Bash
Executable File
463 lines
13 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# Copyright (C) 2018 Red Hat <contact@redhat.com>
|
|
#
|
|
# Author: David Zafman <dzafman@redhat.com>
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU Library Public License as published by
|
|
# the Free Software Foundation; either version 2, or (at your option)
|
|
# any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Library Public License for more details.
|
|
#
|
|
# Pull in the standalone test helpers (setup, run_mon, run_osd, pg_deep_scrub,
# wait_for_clean, main, ...).  Quote the path so a CEPH_ROOT containing spaces
# or glob characters cannot break the source command (SC2086).
source "$CEPH_ROOT/qa/standalone/ceph-helpers.sh"
|
# Test driver invoked by main(): sets up environment variables shared by all
# tests, then runs either the TEST_* functions named on the command line or,
# by default, every TEST_* function defined in this file.
#   $1  - test directory, passed through to each TEST_* function
#   $2+ - optional list of specific TEST_* function names to run
# Returns non-zero as soon as any test fails.
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7138" # git grep '\<7138\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    export -n CEPH_CLI_TEST_DUP_COMMAND
    # Default: discover all TEST_* functions from the current shell's symbol
    # table.  $funcs is intentionally left unquoted below so it word-splits
    # into one function name per iteration.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        $func "$dir" || return 1
    done
}
|
# Verify deep-scrub error accounting: corrupt one replica of an object,
# deep-scrub, and check that num_scrub_errors is reported consistently on the
# primary and peers, across primary changes (osd out/in) and after repair.
#   $1 - test directory
function TEST_scrub_test() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=15

    TESTDATA="testdata.$$"

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)
    # Pick a replica OSD distinct from the one get_not_primary returned, so
    # the corruption lands on a known non-primary shard.
    if [ "$otherosd" = "2" ];
    then
        local anotherosd="0"
    else
        local anotherosd="2"
    fi

    # Corrupt obj1's on-disk data on one replica only.
    objectstore_tool $dir $anotherosd obj1 set-bytes /etc/fstab

    local pgid="${poolid}.0"
    pg_deep_scrub "$pgid" || return 1

    # Deep scrub must flag the PG inconsistent and count 2 scrub errors.
    ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1
    test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1

    # Force a primary change and re-check the error counts from the new
    # primary's point of view.
    ceph osd out $primary
    wait_for_clean || return 1

    pg_deep_scrub "$pgid" || return 1

    test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1
    test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1
    ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1

    ceph osd in $primary
    wait_for_clean || return 1

    repair "$pgid" || return 1
    wait_for_clean || return 1

    # This sets up the test after we've repaired with previous primary has old value
    test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1
    ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1

    ceph osd out $primary
    wait_for_clean || return 1

    # After repair and primary change, all error counters must be zero and
    # the inconsistent flag cleared.
    test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "0" || return 1
    test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "0" || return 1
    test "$(ceph pg $pgid query | jq '.peer_info[1].stats.stat_sum.num_scrub_errors')" = "0" || return 1
    ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1

    teardown $dir || return 1
}
|
# Grab year-month-day
# sed expression that trims a timestamp down to its leading YYYY-MM-DD part;
# used by check_dump_scrubs() for date-only comparisons.
DATESED="s/\([0-9]*-[0-9]*-[0-9]*\).*/\1/"
# date(1) output format matching what DATESED extracts.
DATEFORMAT="%Y-%m-%d"
|
# Check that the first dump_scrubs entry on the given OSD has the expected
# schedule time and deadline, comparing dates only (YYYY-MM-DD).
#   $1 - primary OSD id
#   $2 - expected sched_time offset from now, in date -d syntax (e.g. "2 days")
#   $3 - expected deadline offset from now, in date -d syntax (e.g. "1 week")
# Sets globals DS, SCHED_TIME and DEADLINE as a side effect (kept for
# compatibility with the original implementation).
# Returns non-zero on mismatch.
function check_dump_scrubs() {
    local primary=$1
    local sched_time_check="$2"
    local deadline_check="$3"

    DS="$(CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) dump_scrubs)" || return 1
    # jq -r emits the raw string, so no eval is needed to strip the
    # double-quotes (and we avoid eval'ing daemon-supplied text).
    SCHED_TIME=$(echo "$DS" | jq -r '.[0].sched_time')
    test "$(echo "$SCHED_TIME" | sed "$DATESED")" = "$(date +${DATEFORMAT} -d "now + $sched_time_check")" || return 1
    DEADLINE=$(echo "$DS" | jq -r '.[0].deadline')
    test "$(echo "$DEADLINE" | sed "$DATESED")" = "$(date +${DATEFORMAT} -d "now + $deadline_check")" || return 1
}
|
|
# Verify that the OSD's scheduled scrub time and deadline track runtime
# changes to the global and per-pool scrub min/max interval settings.
#   $1 - test directory
# Fix: the original relied on Bash's dynamic scoping to pick up the caller's
# $dir; declare it explicitly from $1 like every other test here.
function TEST_interval_changes() {
    local dir=$1
    local poolname=test
    local OSDS=2
    local objects=10
    # Don't assume how internal defaults are set
    local day="$(expr 24 \* 60 \* 60)"
    local week="$(expr $day \* 7)"
    local min_interval=$day
    local max_interval=$week
    local WAIT_FOR_UPDATE=15

    TESTDATA="testdata.$$"

    setup $dir || return 1
    # This min scrub interval results in 30 seconds backoff time
    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd --osd_scrub_min_interval=$min_interval --osd_scrub_max_interval=$max_interval --osd_scrub_interval_randomize_ratio=0 || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    local poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local primary=$(get_primary $poolname obj1)

    # Check initial settings from above (min 1 day, max 1 week)
    check_dump_scrubs $primary "1 day" "1 week" || return 1

    # Change global osd_scrub_min_interval to 2 days
    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) config set osd_scrub_min_interval $(expr $day \* 2)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "2 days" "1 week" || return 1

    # Change global osd_scrub_max_interval to 2 weeks
    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) config set osd_scrub_max_interval $(expr $week \* 2)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "2 days" "2 week" || return 1

    # Change pool osd_scrub_min_interval to 3 days; pool settings override
    # the global ones.
    ceph osd pool set $poolname scrub_min_interval $(expr $day \* 3)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "3 days" "2 week" || return 1

    # Change pool osd_scrub_max_interval to 3 weeks
    ceph osd pool set $poolname scrub_max_interval $(expr $week \* 3)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "3 days" "3 week" || return 1

    teardown $dir || return 1
}
|
|
# Verify osd_scrub_extended_sleep: start OSDs whose permitted scrub window is
# days in the future so a requested scrub enters extended sleep, then open the
# window and confirm the scrub runs only after the extended sleep elapses.
#   $1 - test directory
function TEST_scrub_extended_sleep() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=15

    TESTDATA="testdata.$$"

    # Current day of week (0-6); scrub window is placed 2-3 days ahead.
    DAY=$(date +%w)
    # Handle wrap
    if [ "$DAY" -ge "4" ];
    then
        DAY="0"
    fi
    # Start after 2 days in case we are near midnight
    DAY_START=$(expr $DAY + 2)
    DAY_END=$(expr $DAY + 3)

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        # 20s extended sleep with randomization disabled makes the timing
        # below (not before 10s, done by 25s) deterministic.
        run_osd $dir $osd --osd_scrub_sleep=0 \
                          --osd_scrub_extended_sleep=20 \
                          --bluestore_cache_autotune=false \
                          --osd_deep_scrub_randomize_ratio=0.0 \
                          --osd_scrub_interval_randomize_ratio=0 \
                          --osd_scrub_begin_week_day=$DAY_START \
                          --osd_scrub_end_week_day=$DAY_END \
                          || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1

    # Trigger a scrub on a PG
    local pgid=$(get_pg $poolname SOMETHING)
    local primary=$(get_primary $poolname SOMETHING)
    local last_scrub=$(get_last_scrub_stamp $pgid)
    ceph tell $pgid scrub || return 1

    # Allow scrub to start extended sleep
    PASSED="false"
    for ((i=0; i < 15; i++)); do
        if grep -q "scrub state.*, sleeping" $dir/osd.${primary}.log
        then
            PASSED="true"
            break
        fi
        sleep 1
    done

    # Check that extended sleep was triggered
    if [ $PASSED = "false" ];
    then
        return 1
    fi

    # release scrub to run after extended sleep finishes
    # (begin == end == 0 means "no day restriction")
    ceph tell osd.$primary config set osd_scrub_begin_week_day 0
    ceph tell osd.$primary config set osd_scrub_end_week_day 0

    # Due to extended sleep, the scrub should not be done within 20 seconds
    # but test up to 10 seconds and make sure it happens by 25 seconds.
    count=0
    PASSED="false"
    for ((i=0; i < 25; i++)); do
        count=$(expr $count + 1)
        if test "$(get_last_scrub_stamp $pgid)" '>' "$last_scrub" ; then
            # Did scrub run too soon?
            if [ $count -lt "10" ];
            then
                return 1
            fi
            PASSED="true"
            break
        fi
        sleep 1
    done

    # Make sure scrub eventually ran
    if [ $PASSED = "false" ];
    then
        return 1
    fi

    teardown $dir || return 1
}
|
|
|
# Shared body for TEST_scrub_abort / TEST_deep_scrub_abort: start a (deep-)
# scrub slowed down by osd_scrub_sleep, abort it by setting the matching
# no(deep-)scrub flag, verify the abort is logged, then clear the flags and
# confirm the scrub completes.
#   $1 - test directory
#   $2 - "scrub" or "deep_scrub"
function _scrub_abort() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=1000
    local type=$2

    TESTDATA="testdata.$$"
    # noscrub/nodeep-scrub is the cluster flag to set; $check is the matching
    # token expected in the OSD log's abort message.
    if test $type = "scrub";
    then
        stopscrub="noscrub"
        check="noscrub"
    else
        stopscrub="nodeep-scrub"
        check="nodeep_scrub"
    fi


    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        # Set scheduler to "wpq" until there's a reliable way to query scrub
        # states with "--osd-scrub-sleep" set to 0. The "mclock_scheduler"
        # overrides the scrub sleep to 0 and as a result the checks in the
        # test fail.
        run_osd $dir $osd --osd_pool_default_pg_autoscale_mode=off \
            --osd_deep_scrub_randomize_ratio=0.0 \
            --osd_scrub_sleep=5.0 \
            --osd_scrub_interval_randomize_ratio=0 \
            --osd_op_queue=wpq || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    # 1000 objects + 5s scrub sleep keeps the scrub running long enough to
    # observe and abort it.
    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local primary=$(get_primary $poolname obj1)
    local pgid="${poolid}.0"

    ceph tell $pgid $type || return 1
    # deep-scrub won't start without scrub noticing
    if [ "$type" = "deep_scrub" ];
    then
        ceph tell $pgid scrub || return 1
    fi

    # Wait for scrubbing to start
    set -o pipefail
    found="no"
    for i in $(seq 0 200)
    do
        flush_pg_stats
        if ceph pg dump pgs | grep ^$pgid| grep -q "scrubbing"
        then
            found="yes"
            #ceph pg dump pgs
            break
        fi
    done
    set +o pipefail

    if test $found = "no";
    then
        echo "Scrubbing never started"
        return 1
    fi

    # Abort the in-flight scrub by setting the corresponding cluster flag.
    ceph osd set $stopscrub
    if [ "$type" = "deep_scrub" ];
    then
        ceph osd set noscrub
    fi

    # Wait for scrubbing to end
    set -o pipefail
    for i in $(seq 0 200)
    do
        flush_pg_stats
        if ceph pg dump pgs | grep ^$pgid | grep -q "scrubbing"
        then
            continue
        fi
        #ceph pg dump pgs
        break
    done
    set +o pipefail

    sleep 5

    if ! grep "$check set, aborting" $dir/osd.${primary}.log
    then
        echo "Abort not seen in log"
        return 1
    fi

    local last_scrub=$(get_last_scrub_stamp $pgid)
    # Speed the re-run up: drop the per-chunk sleep from 5s to 0.1s.
    ceph config set osd "osd_scrub_sleep" "0.1"

    ceph osd unset $stopscrub
    if [ "$type" = "deep_scrub" ];
    then
        ceph osd unset noscrub
    fi
    # TIMEOUT is read by wait_for_scrub (ceph-helpers.sh) — presumably its
    # polling budget in seconds; scale it with the object count.
    TIMEOUT=$(($objects / 2))
    wait_for_scrub $pgid "$last_scrub" || return 1

    teardown $dir || return 1
}
|
|
|
# Exercise aborting a regular scrub via the noscrub flag.
#   $1 - test directory
function TEST_scrub_abort() {
    local dir=$1

    _scrub_abort "$dir" scrub
}
|
|
|
# Exercise aborting a deep scrub via the nodeep-scrub flag.
#   $1 - test directory
function TEST_deep_scrub_abort() {
    local dir=$1

    _scrub_abort "$dir" deep_scrub
}
|
|
|
# Verify that scrubs are not started outside the permitted hour window:
# configure a scrub window that ended an hour ago, request a scrub, and
# confirm the scrub stamp does not advance within 30 seconds.
#   $1 - test directory
function TEST_scrub_permit_time() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=15

    TESTDATA="testdata.$$"

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    # Window [2 hours ago, 1 hour ago] — already closed.  sed strips a
    # leading zero so the value isn't parsed as octal.
    local scrub_begin_hour=$(date -d '2 hour ago' +"%H" | sed 's/^0//')
    local scrub_end_hour=$(date -d '1 hour ago' +"%H" | sed 's/^0//')
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd --bluestore_cache_autotune=false \
                          --osd_deep_scrub_randomize_ratio=0.0 \
                          --osd_scrub_interval_randomize_ratio=0 \
                          --osd_scrub_begin_hour=$scrub_begin_hour \
                          --osd_scrub_end_hour=$scrub_end_hour || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1

    # Trigger a scrub on a PG
    local pgid=$(get_pg $poolname SOMETHING)
    local primary=$(get_primary $poolname SOMETHING)
    local last_scrub=$(get_last_scrub_stamp $pgid)
    # If we don't specify an amount of time to subtract from
    # current time to set last_scrub_stamp, it sets the deadline
    # back by osd_max_interval which would cause the time permit checking
    # to be skipped. Set back 1 day, the default scrub_min_interval.
    ceph tell $pgid scrub $(( 24 * 60 * 60 )) || return 1

    # Scrub should not run
    for ((i=0; i < 30; i++)); do
        if test "$(get_last_scrub_stamp $pgid)" '>' "$last_scrub" ; then
            return 1
        fi
        sleep 1
    done

    teardown $dir || return 1
}
|
|
|
# Entry point supplied by ceph-helpers.sh: sets up logging for
# "osd-scrub-test" and dispatches to run() with the CLI arguments.
main osd-scrub-test "$@"
# Local Variables:
|
|
# compile-command: "cd build ; make -j4 && \
|
|
# ../qa/run-standalone.sh osd-scrub-test.sh"
|
|
# End:
|