mirror of
https://github.com/ceph/ceph
synced 2025-01-20 10:01:45 +00:00
Merge pull request #4174 from trociny/wip-10976.master
osd: fix PG::all_unfound_are_queried_or_lost for non-existent osds Reviewed-by: Kefu Chai <tchaikov@gmail.com> Reviewed-by: Samuel Just <sjust@redhat.com>
This commit is contained in:
commit
aec2f5de3b
@ -792,6 +792,8 @@ bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
|
||||
if (iter != peer_info.end() &&
|
||||
(iter->second.is_empty() || iter->second.dne()))
|
||||
continue;
|
||||
if (!osdmap->exists(peer->osd))
|
||||
continue;
|
||||
const osd_info_t &osd_info(osdmap->get_info(peer->osd));
|
||||
if (osd_info.lost_at <= osd_info.up_from) {
|
||||
// If there is even one OSD in might_have_unfound that isn't lost, we
|
||||
|
@ -38,12 +38,33 @@ die() {
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Test that a flag is set, i.e. that the element is found in the list.
#
# Usage:   is_set FLAG [FLAGS...]
# Returns: 0 if FLAG is string-equal to one of FLAGS, 1 otherwise
#          (including when no FLAGS are given).
is_set()
{
        local flag=$1; shift
        local i

        # Walk the positional parameters directly. Flattening them into a
        # single string and re-splitting it with an unquoted expansion would
        # also subject every element to pathname expansion.
        for i in "$@"; do
                if [ "${flag}" = "${i}" ]; then
                        return 0
                fi
        done

        return 1
}
|
||||
|
||||
# Stop an OSD started by vstart
|
||||
stop_osd() {
|
||||
osd_index=$1
|
||||
pidfile="out/osd.$osd_index.pid"
|
||||
if [ -e $pidfile ]; then
|
||||
kill `cat $pidfile` && return 0
|
||||
if kill `cat $pidfile` ; then
|
||||
poll_cmd "eval test -e $pidfile ; echo \$?" "1" 1 30
|
||||
[ $? -eq 1 ] && return 0
|
||||
echo "ceph-osd process did not terminate correctly"
|
||||
else
|
||||
echo "kill `cat $pidfile` failed"
|
||||
fi
|
||||
else
|
||||
echo "ceph-osd process $osd_index is not running"
|
||||
fi
|
||||
@ -144,7 +165,7 @@ start_recovery() {
|
||||
CEPH_NUM_OSD=$1
|
||||
osd=0
|
||||
while [ $osd -lt $CEPH_NUM_OSD ]; do
|
||||
./ceph -c ./ceph.conf osd tell $osd debug kick_recovery_wq 0
|
||||
./ceph -c ./ceph.conf tell osd.$osd debug kick_recovery_wq 0
|
||||
osd=$((osd+1))
|
||||
done
|
||||
}
|
||||
|
@ -19,6 +19,26 @@ setup() {
|
||||
|
||||
# set recovery start to a really long time to ensure that we don't start recovery
|
||||
./vstart.sh -d -n -o "$vstart_config" || die "vstart failed"
|
||||
|
||||
# for existing pools, set size to no greater than the number of OSDs,
|
||||
# so recovery from degraded pgs is possible
|
||||
local changed=0
|
||||
for pool in `./ceph osd pool ls`; do
|
||||
local size=`./ceph osd pool get ${pool} size | awk '{print $2}'`
|
||||
if [ "${size}" -gt "${CEPH_NUM_OSD}" ]; then
|
||||
./ceph osd pool set ${pool} size ${CEPH_NUM_OSD}
|
||||
changed=1
|
||||
fi
|
||||
done
|
||||
if [ ${changed} -eq 1 ]; then
|
||||
# XXX: When a pool has degraded pgs due to size greater than number
|
||||
# of OSDs, after decreasing the size the recovery could still get stuck
|
||||
# and require an additional kick.
|
||||
./ceph osd out 0
|
||||
./ceph osd in 0
|
||||
fi
|
||||
|
||||
poll_cmd "./ceph health" HEALTH_OK 1 30
|
||||
}
|
||||
|
||||
recovery1_impl() {
|
||||
@ -65,7 +85,13 @@ recovery1() {
|
||||
}
|
||||
|
||||
lost1_impl() {
|
||||
try_to_fetch_unfound=$1
|
||||
local flags="$@"
|
||||
local lost_action=delete
|
||||
local pgs_unfound pg
|
||||
|
||||
if is_set revert_lost $flags; then
|
||||
lost_action=revert
|
||||
fi
|
||||
|
||||
# Write lots and lots of objects
|
||||
write_objects 1 1 20 8000 $TEST_POOL
|
||||
@ -91,7 +117,20 @@ lost1_impl() {
|
||||
poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
|
||||
[ $? -eq 1 ] || die "Failed to see unfound objects."
|
||||
|
||||
if [ "$try_to_fetch_unfound" -eq 1 ]; then
|
||||
pgs_unfound=`./ceph health detail |awk '$1 = "pg" && /[0-9] unfound$/ {print $2}'`
|
||||
|
||||
[ -n "$pgs_unfound" ] || die "no pg with unfound objects"
|
||||
|
||||
for pg in $pgs_unfound; do
|
||||
./ceph pg $pg mark_unfound_lost revert &&
|
||||
die "mark_unfound_lost unexpectedly succeeded for pg $pg"
|
||||
done
|
||||
|
||||
if ! is_set mark_osd_lost $flags && ! is_set rm_osd $flags; then
|
||||
return
|
||||
fi
|
||||
|
||||
if is_set try_to_fetch_unfound $flags; then
|
||||
# Ask for an object while it's still unfound, and
|
||||
# verify we get woken to an error when it's declared lost.
|
||||
echo "trying to get one of the unfound objects"
|
||||
@ -101,19 +140,43 @@ lost1_impl() {
|
||||
) &
|
||||
fi
|
||||
|
||||
# Lose all objects.
|
||||
./ceph osd lost 0 --yes-i-really-mean-it
|
||||
if is_set mark_osd_lost $flags; then
|
||||
./ceph osd lost 0 --yes-i-really-mean-it
|
||||
fi
|
||||
|
||||
if is_set rm_osd $flags; then
|
||||
./ceph osd rm 0
|
||||
fi
|
||||
|
||||
if ! is_set auto_mark_unfound_lost $flags; then
|
||||
for pg in $pgs_unfound; do
|
||||
./ceph pg $pg mark_unfound_lost ${lost_action} ||
|
||||
die "mark_unfound_lost failed for pg $pg"
|
||||
done
|
||||
fi
|
||||
|
||||
start_recovery 2
|
||||
|
||||
# Unfound objects go away and are turned into lost objects.
|
||||
poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
|
||||
[ $? -eq 1 ] || die "Unfound objects didn't go away."
|
||||
|
||||
for pg in `ceph pg ls | awk '/^[0-9]/ {print $1}'`; do
|
||||
./ceph pg $pg mark_unfound_lost revert 2>&1 |
|
||||
grep 'pg has no unfound objects' ||
|
||||
die "pg $pg has unfound objects"
|
||||
done
|
||||
|
||||
# Reading from a lost object gives back an error code.
|
||||
# TODO: check error code
|
||||
./rados -c ./ceph.conf -p $TEST_POOL get obj01 $TEMPDIR/obj01 &&\
|
||||
./rados -c ./ceph.conf -p $TEST_POOL get obj01 $TEMPDIR/obj01
|
||||
if [ lost_action = delete -a $? -eq 0 ]; then
|
||||
die "expected radostool error"
|
||||
elif [ lost_action = revert -a $? -ne 0 ]; then
|
||||
die "unexpected radostool error"
|
||||
fi
|
||||
|
||||
if [ "$try_to_fetch_unfound" -eq 1 ]; then
|
||||
if is_set try_to_fetch_unfound $flags; then
|
||||
echo "waiting for the try_to_fetch_unfound \
|
||||
radostool instance to finish"
|
||||
wait
|
||||
@ -122,16 +185,31 @@ radostool instance to finish"
|
||||
|
||||
# Lost-object test, "revert" variant.
#
# Brings the cluster up through setup (called with 2 and a very long
# recovery start delay), then runs lost1_impl with these flags:
#   mark_osd_lost - lost1_impl runs `./ceph osd lost 0 --yes-i-really-mean-it`
#   revert_lost   - lost1_impl uses "revert" instead of the default "delete"
#                   as the mark_unfound_lost action
# The exit status is that of lost1_impl.
lost1() {
        setup 2 'osd recovery delay start = 10000'
        # lost1_impl takes flag names (checked via is_set), not the old
        # numeric argument, so it is invoked exactly once with the flags.
        lost1_impl mark_osd_lost revert_lost
}
|
||||
|
||||
# Lost-object test, "fetch while unfound" variant.
#
# Brings the cluster up through setup (called with 2 and a very long
# recovery start delay), then runs lost1_impl with these flags:
#   mark_osd_lost        - lost1_impl runs
#                          `./ceph osd lost 0 --yes-i-really-mean-it`
#   try_to_fetch_unfound - lost1_impl asks for an object while it is still
#                          unfound and later waits for that background
#                          radostool instance to finish
# The exit status is that of lost1_impl.
lost2() {
        setup 2 'osd recovery delay start = 10000'
        # lost1_impl takes flag names (checked via is_set), not the old
        # numeric argument, so it is invoked exactly once with the flags.
        lost1_impl mark_osd_lost try_to_fetch_unfound
}
|
||||
|
||||
# Lost-object test, "remove the OSD" variant.
#
# Calls setup with 2 and a very long recovery start delay, then runs
# lost1_impl with only the rm_osd flag, which makes lost1_impl run
# `./ceph osd rm 0` without marking the OSD lost first.
# The exit status is that of lost1_impl.
lost3() {
        local lost3_vstart_opt='osd recovery delay start = 10000'

        setup 2 "${lost3_vstart_opt}"
        lost1_impl rm_osd
}
|
||||
|
||||
# Lost-object test, "mark lost, then remove the OSD" variant.
#
# Calls setup with 2 and a very long recovery start delay, then runs
# lost1_impl with both mark_osd_lost and rm_osd, so lost1_impl runs
# `./ceph osd lost 0 --yes-i-really-mean-it` followed by `./ceph osd rm 0`.
# The exit status is that of lost1_impl.
lost4() {
        local lost4_vstart_opt='osd recovery delay start = 10000'

        setup 2 "${lost4_vstart_opt}"
        lost1_impl mark_osd_lost rm_osd
}
|
||||
|
||||
# Lost-object test, "automatic mark_unfound_lost" variant.
#
# Calls setup with 2 and a very long recovery start delay, then runs
# lost1_impl with mark_osd_lost and auto_mark_unfound_lost. With the
# latter flag lost1_impl skips its explicit `ceph pg ... mark_unfound_lost`
# loop.
# The exit status is that of lost1_impl.
lost5() {
        local lost5_vstart_opt='osd recovery delay start = 10000'

        setup 2 "${lost5_vstart_opt}"
        lost1_impl mark_osd_lost auto_mark_unfound_lost
}
|
||||
|
||||
all_osds_die_impl() {
|
||||
poll_cmd "./ceph osd stat -o -" '3 up, 3 in' 20 240
|
||||
poll_cmd "./ceph osd stat" '3 up, 3 in' 20 240
|
||||
[ $? -eq 1 ] || die "didn't start 3 osds"
|
||||
|
||||
stop_osd 0
|
||||
@ -139,7 +217,7 @@ all_osds_die_impl() {
|
||||
stop_osd 2
|
||||
|
||||
# wait for the MOSDPGStat timeout
|
||||
poll_cmd "./ceph osd stat -o -" '0 up' 20 240
|
||||
poll_cmd "./ceph osd stat" '0 up' 20 240
|
||||
[ $? -eq 1 ] || die "all osds weren't marked as down"
|
||||
}
|
||||
|
||||
@ -156,9 +234,24 @@ run() {
|
||||
|
||||
lost1 || die "test failed"
|
||||
|
||||
lost2 || die "test failed"
|
||||
# XXX: try_to_fetch_unfound test currently hangs on "waiting for the
|
||||
# try_to_fetch_unfound radostool instance to finish"
|
||||
#lost2 || die "test failed"
|
||||
|
||||
lost3 || die "test failed"
|
||||
|
||||
lost4 || die "test failed"
|
||||
|
||||
# XXX: automatically marking lost is not implemented
|
||||
#lost5 || die "test failed"
|
||||
|
||||
all_osds_die || die "test failed"
|
||||
}
|
||||
|
||||
# Script entry point.
#
# With no arguments (or only empty ones), run the whole suite via run,
# print OK and exit successfully. "$*" always expands to a single word, so
# the test stays well-formed however many arguments were passed; "$@" would
# expand to several words and make `[` fail with "too many arguments".
if [ -z "$*" ]; then
        run
        echo OK
        exit 0
fi

# Otherwise execute the arguments as a command (presumably the name of a
# single test function plus its arguments - confirm against how the script
# is invoked). Quoting keeps each argument intact instead of re-splitting
# and glob-expanding it.
"$@"
|
||||
|
Loading…
Reference in New Issue
Block a user