mirror of
https://github.com/ceph/ceph
synced 2025-01-09 04:30:26 +00:00
eec821b6e5
Change TEST_recovery_scrub_2 to create more objects and use osd_recovery_sleep to prevent recovery from finihing before we start to scrub. Verify that at least 1 scrub was started while the pg was reovering. Fixes: https://tracker.ceph.com/issues/49779 Signed-off-by: David Zafman <dzafman@redhat.com>
353 lines
9.3 KiB
Bash
Executable File
353 lines
9.3 KiB
Bash
Executable File
#! /usr/bin/env bash
|
|
#
|
|
# Copyright (C) 2017 Red Hat <contact@redhat.com>
|
|
#
|
|
# Author: David Zafman <dzafman@redhat.com>
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU Library Public License as published by
|
|
# the Free Software Foundation; either version 2, or (at your option)
|
|
# any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Library Public License for more details.
|
|
#
|
|
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
|
|
|
|
function run() {
|
|
local dir=$1
|
|
shift
|
|
|
|
export CEPH_MON="127.0.0.1:7124" # git grep '\<7124\>' : there must be only one
|
|
export CEPH_ARGS
|
|
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
|
|
CEPH_ARGS+="--mon-host=$CEPH_MON "
|
|
|
|
export -n CEPH_CLI_TEST_DUP_COMMAND
|
|
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
|
|
for func in $funcs ; do
|
|
$func $dir || return 1
|
|
done
|
|
}
|
|
|
|
# Simple test for "not scheduling scrubs due to active recovery"
|
|
# OSD::sched_scrub() called on all OSDs during ticks
|
|
function TEST_recovery_scrub_1() {
|
|
local dir=$1
|
|
local poolname=test
|
|
|
|
TESTDATA="testdata.$$"
|
|
OSDS=4
|
|
PGS=1
|
|
OBJECTS=100
|
|
ERRORS=0
|
|
|
|
setup $dir || return 1
|
|
run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
|
|
--osd_scrub_interval_randomize_ratio=0.0 || return 1
|
|
run_mgr $dir x || return 1
|
|
for osd in $(seq 0 $(expr $OSDS - 1))
|
|
do
|
|
run_osd $dir $osd --osd_scrub_during_recovery=false || return 1
|
|
done
|
|
|
|
# Create a pool with $PGS pgs
|
|
create_pool $poolname $PGS $PGS
|
|
wait_for_clean || return 1
|
|
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
|
|
|
|
ceph pg dump pgs
|
|
|
|
dd if=/dev/urandom of=$TESTDATA bs=1M count=50
|
|
for i in $(seq 1 $OBJECTS)
|
|
do
|
|
rados -p $poolname put obj${i} $TESTDATA
|
|
done
|
|
rm -f $TESTDATA
|
|
|
|
ceph osd pool set $poolname size 4
|
|
|
|
# Wait for recovery to start
|
|
set -o pipefail
|
|
count=0
|
|
while(true)
|
|
do
|
|
if ceph --format json pg dump pgs |
|
|
jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
|
|
then
|
|
break
|
|
fi
|
|
sleep 2
|
|
if test "$count" -eq "10"
|
|
then
|
|
echo "Recovery never started"
|
|
return 1
|
|
fi
|
|
count=$(expr $count + 1)
|
|
done
|
|
set +o pipefail
|
|
ceph pg dump pgs
|
|
|
|
sleep 10
|
|
# Work around for http://tracker.ceph.com/issues/38195
|
|
kill_daemons $dir #|| return 1
|
|
|
|
declare -a err_strings
|
|
err_strings[0]="not scheduling scrubs due to active recovery"
|
|
|
|
for osd in $(seq 0 $(expr $OSDS - 1))
|
|
do
|
|
grep "not scheduling scrubs" $dir/osd.${osd}.log
|
|
done
|
|
for err_string in "${err_strings[@]}"
|
|
do
|
|
found=false
|
|
count=0
|
|
for osd in $(seq 0 $(expr $OSDS - 1))
|
|
do
|
|
if grep -q "$err_string" $dir/osd.${osd}.log
|
|
then
|
|
found=true
|
|
count=$(expr $count + 1)
|
|
fi
|
|
done
|
|
if [ "$found" = "false" ]; then
|
|
echo "Missing log message '$err_string'"
|
|
ERRORS=$(expr $ERRORS + 1)
|
|
fi
|
|
[ $count -eq $OSDS ] || return 1
|
|
done
|
|
|
|
teardown $dir || return 1
|
|
|
|
if [ $ERRORS != "0" ];
|
|
then
|
|
echo "TEST FAILED WITH $ERRORS ERRORS"
|
|
return 1
|
|
fi
|
|
|
|
echo "TEST PASSED"
|
|
return 0
|
|
}
|
|
|
|
##
|
|
# a modified version of wait_for_scrub(), which terminates if the Primary
|
|
# of the to-be-scrubbed PG changes
|
|
#
|
|
# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
|
|
# will fail if scrub does not complete within $TIMEOUT seconds. The
|
|
# repair is complete whenever the **get_last_scrub_stamp** function
|
|
# reports a timestamp different from the one given in argument.
|
|
#
|
|
# @param pgid the id of the PG
|
|
# @param the primary OSD when started
|
|
# @param last_scrub timestamp of the last scrub for *pgid*
|
|
# @return 0 on success, 1 on error
|
|
#
|
|
function wait_for_scrub_mod() {
|
|
local pgid=$1
|
|
local orig_primary=$2
|
|
local last_scrub="$3"
|
|
local sname=${4:-last_scrub_stamp}
|
|
|
|
for ((i=0; i < $TIMEOUT; i++)); do
|
|
sleep 0.2
|
|
if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
|
|
return 0
|
|
fi
|
|
sleep 1
|
|
# are we still the primary?
|
|
local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
|
|
if [ $orig_primary != $current_primary ]; then
|
|
echo $orig_primary no longer primary for $pgid
|
|
return 0
|
|
fi
|
|
done
|
|
return 1
|
|
}
|
|
|
|
##
|
|
# A modified version of pg_scrub()
|
|
#
|
|
# Run scrub on **pgid** and wait until it completes. The pg_scrub
|
|
# function will fail if repair does not complete within $TIMEOUT
|
|
# seconds. The pg_scrub is complete whenever the
|
|
# **get_last_scrub_stamp** function reports a timestamp different from
|
|
# the one stored before starting the scrub, or whenever the Primary
|
|
# changes.
|
|
#
|
|
# @param pgid the id of the PG
|
|
# @return 0 on success, 1 on error
|
|
#
|
|
function pg_scrub_mod() {
|
|
local pgid=$1
|
|
local last_scrub=$(get_last_scrub_stamp $pgid)
|
|
# locate the primary
|
|
local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
|
|
local recovery=false
|
|
ceph pg scrub $pgid
|
|
#ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state"
|
|
if ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" | grep -q recovering
|
|
then
|
|
recovery=true
|
|
fi
|
|
wait_for_scrub_mod $pgid $my_primary "$last_scrub" || return 1
|
|
if test $recovery = "true"
|
|
then
|
|
return 2
|
|
fi
|
|
}
|
|
|
|
# Same as wait_background() except that it checks for exit code 2 and bumps recov_scrub_count
|
|
function wait_background_check() {
|
|
# We extract the PIDS from the variable name
|
|
pids=${!1}
|
|
|
|
return_code=0
|
|
for pid in $pids; do
|
|
wait $pid
|
|
retcode=$?
|
|
if test $retcode -eq 2
|
|
then
|
|
recov_scrub_count=$(expr $recov_scrub_count + 1)
|
|
elif test $retcode -ne 0
|
|
then
|
|
# If one process failed then return 1
|
|
return_code=1
|
|
fi
|
|
done
|
|
|
|
# We empty the variable reporting that all process ended
|
|
eval "$1=''"
|
|
|
|
return $return_code
|
|
}
|
|
|
|
# osd_scrub_during_recovery=true make sure scrub happens
|
|
function TEST_recovery_scrub_2() {
|
|
local dir=$1
|
|
local poolname=test
|
|
|
|
TESTDATA="testdata.$$"
|
|
OSDS=8
|
|
PGS=32
|
|
OBJECTS=40
|
|
|
|
setup $dir || return 1
|
|
run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
|
|
--osd_scrub_interval_randomize_ratio=0.0 || return 1
|
|
run_mgr $dir x || return 1
|
|
for osd in $(seq 0 $(expr $OSDS - 1))
|
|
do
|
|
run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 || return 1
|
|
done
|
|
|
|
# Create a pool with $PGS pgs
|
|
create_pool $poolname $PGS $PGS
|
|
wait_for_clean || return 1
|
|
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
|
|
|
|
dd if=/dev/urandom of=$TESTDATA bs=1M count=50
|
|
for i in $(seq 1 $OBJECTS)
|
|
do
|
|
rados -p $poolname put obj${i} $TESTDATA
|
|
done
|
|
rm -f $TESTDATA
|
|
|
|
ceph osd pool set $poolname size 3
|
|
|
|
ceph pg dump pgs
|
|
|
|
# Wait for recovery to start
|
|
count=0
|
|
while(true)
|
|
do
|
|
#ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
|
|
if test $(ceph --format json pg dump pgs |
|
|
jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
|
|
then
|
|
break
|
|
fi
|
|
sleep 2
|
|
if test "$count" -eq "10"
|
|
then
|
|
echo "Not enough recovery started simultaneously"
|
|
return 1
|
|
fi
|
|
count=$(expr $count + 1)
|
|
done
|
|
ceph pg dump pgs
|
|
|
|
pids=""
|
|
recov_scrub_count=0
|
|
for pg in $(seq 0 $(expr $PGS - 1))
|
|
do
|
|
run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
|
|
done
|
|
wait_background_check pids
|
|
return_code=$?
|
|
if [ $return_code -ne 0 ]; then return $return_code; fi
|
|
|
|
ERRORS=0
|
|
if test $recov_scrub_count -eq 0
|
|
then
|
|
echo "No scrubs occurred while PG recovering"
|
|
ERRORS=$(expr $ERRORS + 1)
|
|
fi
|
|
|
|
pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
|
|
pid=$(cat $pidfile)
|
|
if ! kill -0 $pid
|
|
then
|
|
echo "OSD crash occurred"
|
|
#tail -100 $dir/osd.0.log
|
|
ERRORS=$(expr $ERRORS + 1)
|
|
fi
|
|
|
|
# Work around for http://tracker.ceph.com/issues/38195
|
|
kill_daemons $dir #|| return 1
|
|
|
|
declare -a err_strings
|
|
err_strings[0]="not scheduling scrubs due to active recovery"
|
|
|
|
for osd in $(seq 0 $(expr $OSDS - 1))
|
|
do
|
|
grep "not scheduling scrubs" $dir/osd.${osd}.log
|
|
done
|
|
for err_string in "${err_strings[@]}"
|
|
do
|
|
found=false
|
|
for osd in $(seq 0 $(expr $OSDS - 1))
|
|
do
|
|
if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
|
|
then
|
|
found=true
|
|
fi
|
|
done
|
|
if [ "$found" = "true" ]; then
|
|
echo "Found log message not expected '$err_string'"
|
|
ERRORS=$(expr $ERRORS + 1)
|
|
fi
|
|
done
|
|
|
|
teardown $dir || return 1
|
|
|
|
if [ $ERRORS != "0" ];
|
|
then
|
|
echo "TEST FAILED WITH $ERRORS ERRORS"
|
|
return 1
|
|
fi
|
|
|
|
echo "TEST PASSED"
|
|
return 0
|
|
}
|
|
|
|
main osd-recovery-scrub "$@"
|
|
|
|
# Local Variables:
|
|
# compile-command: "cd build ; make -j4 && \
|
|
# ../qa/run-standalone.sh osd-recovery-scrub.sh"
|
|
# End:
|