#!/usr/bin/env bash
#
# Copyright (C) 2015 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#
# Pull in the standalone-test helpers (run_mon, run_osd, pg_scrub, multidiff,
# main, ...).  Quoted so a CEPH_ROOT containing spaces still works.
source "$CEPH_ROOT/qa/standalone/ceph-helpers.sh"
# Test development and debugging
# Set to "yes" in order to ignore diff errors and save results to update test
getjson="no"
# jq filter that extracts just the list of inconsistencies from the
# "rados list-inconsistent-*" output.
jqfilter='.inconsistents'
# python one-liner that re-serializes JSON from stdin with sorted keys and a
# fixed indent so expected and actual documents diff deterministically.
sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print ( json.dumps(ud, sort_keys=True, indent=2) )'
run() {
    # Standalone-test driver: set up a private cluster environment, then run
    # each requested test function (default: every TEST_* defined here),
    # bracketed by setup/teardown.  Stops at the first failure.
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
    export -n CEPH_CLI_TEST_DUP_COMMAND

    # With no explicit arguments, discover every TEST_* function in scope.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir && $func $dir && teardown $dir || return 1
    done
}
function create_scenario() {
# Build a fixed set of snapshot/clone corruptions on one OSD so that a later
# scrub of the pg reports a known list of errors.
# Arguments:
#   $1 - test directory
#   $2 - pool name
#   $3 - scratch data file (repeatedly overwritten by dd)
#   $4 - id of the OSD whose object store gets corrupted
local dir=$1
local poolname=$2
local TESTDATA=$3
local osd=$4
# Take pool snapshots 1..7 while rewriting selected objects, so the objects
# accumulate clones with differing snap sets and sizes.
SNAP=1
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
rados -p $poolname put obj1 $TESTDATA
rados -p $poolname put obj5 $TESTDATA
rados -p $poolname put obj3 $TESTDATA
for i in `seq 6 14`
do rados -p $poolname put obj${i} $TESTDATA
done
SNAP=2
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
rados -p $poolname put obj5 $TESTDATA
SNAP=3
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
rados -p $poolname put obj3 $TESTDATA
SNAP=4
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
rados -p $poolname put obj5 $TESTDATA
rados -p $poolname put obj2 $TESTDATA
SNAP=5
rados -p $poolname mksnap snap${SNAP}
SNAP=6
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
rados -p $poolname put obj5 $TESTDATA
SNAP=7
rados -p $poolname mksnap snap${SNAP}
# Deleting heads after snap 7 leaves clones behind for the checks below.
rados -p $poolname rm obj4
rados -p $poolname rm obj16
rados -p $poolname rm obj2
# The OSD must be stopped before poking its store directly.
kill_daemons $dir TERM osd || return 1
# Don't need to use ceph_objectstore_tool() function because osd stopped
# obj1: force-remove the head so clone 1 becomes headless.
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj1)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" --force remove || return 1
# obj5: remove clone 2 entirely (clone_missing).
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":2)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove || return 1
# obj5 clone 1: remove the object but deliberately keep its snapmap entry.
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":1)"
OBJ5SAVE="$JSON"
# Starts with a snapmap
ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
grep SNA_ $dir/drk.log
grep "^[pm].*SNA_.*[.]1[.]obj5[.][.]$" $dir/drk.log || return 1
ceph-objectstore-tool --data-path $dir/${osd} --rmtype nosnapmap "$JSON" remove || return 1
# Check that snapmap is still there
ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
grep SNA_ $dir/drk.log
grep "^[pm].*SNA_.*[.]1[.]obj5[.][.]$" $dir/drk.log || return 1
rm -f $dir/drk.log
# obj5 clone 4: rewrite with the wrong size (size_mismatch).
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":4)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=18
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA || return 1
# obj3 head: rewrite with the wrong size.
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj3)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=15
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA || return 1
# obj4: remove clone 7 (clone_missing).
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj4 | grep \"snapid\":7)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove || return 1
# obj16: remove clone 7 together with its snapmap entry.
# Starts with a snapmap
ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
grep SNA_ $dir/drk.log
grep "^[pm].*SNA_.*[.]7[.]obj16[.][.]$" $dir/drk.log || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj16 | grep \"snapid\":7)"
ceph-objectstore-tool --data-path $dir/${osd} --rmtype snapmap "$JSON" remove || return 1
# Check that snapmap is now removed
ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
grep SNA_ $dir/drk.log
! grep "^[pm].*SNA_.*[.]7[.]obj16[.][.]$" $dir/drk.log || return 1
rm -f $dir/drk.log
# obj2: strip the snapset attr from head (snapset_missing).
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj2)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" rm-attr snapset || return 1
# Create a clone which isn't in snapset and doesn't have object info
JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=7
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA || return 1
# obj6..obj14: corrupt individual snapset fields with clear-snapset variants.
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj6)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj7)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset corrupt || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj8)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset seq || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj9)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_size || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj10)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_overlap || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj11)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clones || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj12)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset head || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj13)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset snaps || return 1
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj14)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset size || return 1
# obj15: replace the snapset attr with undecodable garbage (snapset_corrupted).
echo "garbage" > $dir/bad
JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj15)"
ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-attr snapset $dir/bad || return 1
rm -f $dir/bad
return 0
}
function TEST_scrub_snaps() {
# Single-copy (pool size 1) scrub test: inject create_scenario()'s
# corruptions on the only OSD, scrub the pg, then verify both the
# rados list-inconsistent-* output and the expected cluster-log errors.
local dir=$1
local poolname=test
local OBJS=16
local OSDS=1
TESTDATA="testdata.$$"
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
run_mgr $dir x || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
# All scrubs done manually. Don't want any unexpected scheduled scrubs.
ceph osd set noscrub || return 1
ceph osd set nodeep-scrub || return 1
# Create a pool with a single pg
create_pool $poolname 1 1
wait_for_clean || return 1
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
for i in `seq 1 $OBJS`
do
rados -p $poolname put obj${i} $TESTDATA
done
local primary=$(get_primary $poolname obj1)
# create_scenario stops the OSD while corrupting its store.
create_scenario $dir $poolname $TESTDATA $primary || return 1
rm -f $TESTDATA
# Restart the OSD(s) stopped by create_scenario.
for osd in $(seq 0 $(expr $OSDS - 1))
do
activate_osd $dir $osd || return 1
done
# Pin chunk sizes and speed up pg stat reporting so the scrub is prompt
# and scans snaps a predictable number of times.
ceph tell osd.* config set osd_shallow_scrub_chunk_max 25
ceph tell osd.* config set osd_shallow_scrub_chunk_min 5
ceph tell osd.* config set osd_pg_stat_report_interval_max_seconds 1
ceph tell osd.* config set osd_pg_stat_report_interval_max_epochs 1
wait_for_clean || return 1
ceph tell osd.* config get osd_shallow_scrub_chunk_max
ceph tell osd.* config get osd_shallow_scrub_chunk_min
ceph tell osd.* config get osd_pg_stat_report_interval_max_seconds
ceph tell osd.* config get osd_pg_stat_report_interval_max_epochs
ceph tell osd.* config get osd_scrub_chunk_max
ceph tell osd.* config get osd_scrub_chunk_min
local pgid="${poolid}.0"
if ! pg_scrub "$pgid" ; then
return 1
fi
# With the chunk sizes above the scrub must scan snaps exactly twice.
test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" = "2" || return 1
rados list-inconsistent-pg $poolname > $dir/json || return 1
# Check pg count
test $(jq '. | length' $dir/json) = "1" || return 1
# Check pgid
test $(jq -r '.[0]' $dir/json) = $pgid || return 1
rados list-inconsistent-obj $pgid > $dir/json || return 1
# The injected snapshot errors with a single copy pool doesn't
# see object errors because all the issues are detected by
# comparing copies.
jq "$jqfilter" << EOF | python3 -c "$sortkeys" > $dir/checkcsjson
{
"epoch": 17,
"inconsistents": []
}
EOF
jq "$jqfilter" $dir/json | python3 -c "$sortkeys" > $dir/csjson
multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
# Now the snapset inconsistencies: one entry per corruption injected by
# create_scenario().
rados list-inconsistent-snapset $pgid > $dir/json || return 1
jq "$jqfilter" << EOF | python3 -c "$sortkeys" > $dir/checkcsjson
{
"inconsistents": [
{
"errors": [
"headless"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj1"
},
{
"errors": [
"size_mismatch"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj10"
},
{
"errors": [
"headless"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj11"
},
{
"errors": [
"size_mismatch"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj14"
},
{
"errors": [
"headless"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj6"
},
{
"errors": [
"headless"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj7"
},
{
"errors": [
"size_mismatch"
],
"snap": 1,
"locator": "",
"nspace": "",
"name": "obj9"
},
{
"errors": [
"headless"
],
"snap": 4,
"locator": "",
"nspace": "",
"name": "obj2"
},
{
"errors": [
"size_mismatch"
],
"snap": 4,
"locator": "",
"nspace": "",
"name": "obj5"
},
{
"errors": [
"headless"
],
"snap": 7,
"locator": "",
"nspace": "",
"name": "obj2"
},
{
"errors": [
"info_missing",
"headless"
],
"snap": 7,
"locator": "",
"nspace": "",
"name": "obj5"
},
{
"name": "obj10",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "????",
"snaps": [
1
]
}
]
},
"errors": []
},
{
"extra clones": [
1
],
"errors": [
"extra_clones"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj11",
"snapset": {
"seq": 1,
"clones": []
}
},
{
"name": "obj14",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": [
{
"snap": 1,
"size": 1033,
"overlap": "[]",
"snaps": [
1
]
}
]
},
"errors": []
},
{
"errors": [
"snapset_corrupted"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj15"
},
{
"extra clones": [
7,
4
],
"errors": [
"snapset_missing",
"extra_clones"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj2"
},
{
"errors": [
"size_mismatch"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj3",
"snapset": {
"seq": 3,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "[]",
"snaps": [
1
]
},
{
"snap": 3,
"size": 256,
"overlap": "[]",
"snaps": [
3,
2
]
}
]
}
},
{
"missing": [
7
],
"errors": [
"clone_missing"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj4",
"snapset": {
"seq": 7,
"clones": [
{
"snap": 7,
"size": 1032,
"overlap": "[]",
"snaps": [
7,
6,
5,
4,
3,
2,
1
]
}
]
}
},
{
"missing": [
2,
1
],
"extra clones": [
7
],
"errors": [
"extra_clones",
"clone_missing"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj5",
"snapset": {
"seq": 6,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "[]",
"snaps": [
1
]
},
{
"snap": 2,
"size": 256,
"overlap": "[]",
"snaps": [
2
]
},
{
"snap": 4,
"size": 512,
"overlap": "[]",
"snaps": [
4,
3
]
},
{
"snap": 6,
"size": 1024,
"overlap": "[]",
"snaps": [
6,
5
]
}
]
}
},
{
"extra clones": [
1
],
"errors": [
"extra_clones"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj6",
"snapset": {
"seq": 1,
"clones": []
}
},
{
"extra clones": [
1
],
"errors": [
"extra_clones"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj7",
"snapset": {
"seq": 0,
"clones": []
}
},
{
"errors": [
"snapset_error"
],
"snap": "head",
"locator": "",
"nspace": "",
"name": "obj8",
"snapset": {
"seq": 0,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "[]",
"snaps": [
1
]
}
]
}
},
{
"name": "obj9",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": [
{
"snap": 1,
"size": "????",
"overlap": "[]",
"snaps": [
1
]
}
]
},
"errors": []
}
],
"epoch": 20
}
EOF
jq "$jqfilter" $dir/json | python3 -c "$sortkeys" > $dir/csjson
multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save1.json
fi
if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
fi
# Collect the OSD pids so a crash can be detected later with kill -0.
pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
pids=""
for pidfile in ${pidfiles}
do
pids+="$(cat $pidfile) "
done
ERRORS=0
# Remove every snapshot and wait for snaptrim to complete, stall on a
# snaptrim_error, or time out.
for i in `seq 1 7`
do
rados -p $poolname rmsnap snap$i
done
sleep 5
local -i loop=0
while ceph pg dump pgs | grep -q snaptrim;
do
if ceph pg dump pgs | grep -q snaptrim_error;
then
break
fi
sleep 2
loop+=1
if (( $loop >= 10 )) ; then
ERRORS=$(expr $ERRORS + 1)
break
fi
done
ceph pg dump pgs
for pid in $pids
do
if ! kill -0 $pid
then
echo "OSD Crash occurred"
ERRORS=$(expr $ERRORS + 1)
fi
done
kill_daemons $dir || return 1
# Each of these messages must appear in the primary OSD's log.
declare -a err_strings
err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj10:.* : is missing in clone_overlap"
err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 : no '_' attr"
err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 : is an unexpected clone"
err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:4 : on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]"
err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head : expected clone .*:::obj5:2"
err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head : expected clone .*:::obj5:1"
err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj5:head : 2 missing clone[(]s[)]"
err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj8:head : snaps.seq not set"
err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:1 : is an unexpected clone"
err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj3:head : on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj6:1 : is an unexpected clone"
err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:head : no 'snapset' attr"
err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:7 : clone ignored due to missing snapset"
err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:4 : clone ignored due to missing snapset"
err_strings[14]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:head : expected clone .*:::obj4:7"
err_strings[15]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:head : 1 missing clone[(]s[)]"
err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj1:1 : is an unexpected clone"
err_strings[17]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 : is missing in clone_size"
err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 : is an unexpected clone"
err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 : size 1032 != clone_size 1033"
err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 20 errors"
err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head : can't decode 'snapset' attr "
err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : osd[.][0-9]* found snap mapper error on pg 1.0 oid 1:461f8b5e:::obj16:7 snaps missing in mapper, should be: {1, 2, 3, 4, 5, 6, 7} ...repaired"
for err_string in "${err_strings[@]}"
do
if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
then
echo "Missing log message '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done
if [ $ERRORS != "0" ];
then
echo "TEST FAILED WITH $ERRORS ERRORS"
return 1
fi
echo "TEST PASSED"
return 0
}
function _scrub_snaps_multi() {
# Replicated (pool size 2) scrub test: corrupt either the primary or the
# replica copy and verify the scrub results differ accordingly.
# Arguments:
#   $1 - test directory
#   $2 - which copy to corrupt: "primary" or "replica"
local dir=$1
local poolname=test
local OBJS=16
local OSDS=2
local which=$2
TESTDATA="testdata.$$"
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
run_mgr $dir x || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
# All scrubs done manually. Don't want any unexpected scheduled scrubs.
ceph osd set noscrub || return 1
ceph osd set nodeep-scrub || return 1
# Create a pool with a single pg
create_pool $poolname 1 1
wait_for_clean || return 1
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
for i in `seq 1 $OBJS`
do
rados -p $poolname put obj${i} $TESTDATA
done
local primary=$(get_primary $poolname obj1)
local replica=$(get_not_primary $poolname obj1)
# Corrupt the OSD selected by $which ($primary or $replica, via indirection).
eval create_scenario $dir $poolname $TESTDATA \$$which || return 1
rm -f $TESTDATA
# Restart the OSDs that create_scenario stopped.
for osd in $(seq 0 $(expr $OSDS - 1))
do
activate_osd $dir $osd || return 1
done
ceph tell osd.* config set osd_shallow_scrub_chunk_max 3
ceph tell osd.* config set osd_shallow_scrub_chunk_min 3
ceph tell osd.* config set osd_scrub_chunk_min 3
ceph tell osd.* config set osd_pg_stat_report_interval_max_seconds 1
ceph tell osd.* config set osd_pg_stat_report_interval_max_epochs 1
wait_for_clean || return 1
local pgid="${poolid}.0"
if ! pg_scrub "$pgid" ; then
return 1
fi
# Both sides must have scanned snaps in multiple chunks.
test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" -gt "3" || return 1
test "$(grep "_scan_snaps start" $dir/osd.${replica}.log | wc -l)" -gt "3" || return 1
rados list-inconsistent-pg $poolname > $dir/json || return 1
# Check pg count
test $(jq '. | length' $dir/json) = "1" || return 1
# Check pgid
test $(jq -r '.[0]' $dir/json) = $pgid || return 1
rados list-inconsistent-obj $pgid --format=json-pretty
rados list-inconsistent-snapset $pgid > $dir/json || return 1
# Since all of the snapshots on the primary is consistent there are no errors here
if [ $which = "replica" ];
then
scruberrors="20"
jq "$jqfilter" << EOF | python3 -c "$sortkeys" > $dir/checkcsjson
{
"epoch": 23,
"inconsistents": []
}
EOF
else
scruberrors="30"
jq "$jqfilter" << EOF | python3 -c "$sortkeys" > $dir/checkcsjson
{
"epoch": 23,
"inconsistents": [
{
"name": "obj10",
"nspace": "",
"locator": "",
"snap": 1,
"errors": [
"size_mismatch"
]
},
{
"name": "obj11",
"nspace": "",
"locator": "",
"snap": 1,
"errors": [
"headless"
]
},
{
"name": "obj14",
"nspace": "",
"locator": "",
"snap": 1,
"errors": [
"size_mismatch"
]
},
{
"name": "obj6",
"nspace": "",
"locator": "",
"snap": 1,
"errors": [
"headless"
]
},
{
"name": "obj7",
"nspace": "",
"locator": "",
"snap": 1,
"errors": [
"headless"
]
},
{
"name": "obj9",
"nspace": "",
"locator": "",
"snap": 1,
"errors": [
"size_mismatch"
]
},
{
"name": "obj5",
"nspace": "",
"locator": "",
"snap": 7,
"errors": [
"info_missing",
"headless"
]
},
{
"name": "obj10",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "????",
"snaps": [
1
]
}
]
},
"errors": []
},
{
"name": "obj11",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": []
},
"errors": [
"extra_clones"
],
"extra clones": [
1
]
},
{
"name": "obj14",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": [
{
"snap": 1,
"size": 1033,
"overlap": "[]",
"snaps": [
1
]
}
]
},
"errors": []
},
{
"name": "obj5",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 6,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "[]",
"snaps": [
1
]
},
{
"snap": 2,
"size": 256,
"overlap": "[]",
"snaps": [
2
]
},
{
"snap": 4,
"size": 512,
"overlap": "[]",
"snaps": [
4,
3
]
},
{
"snap": 6,
"size": 1024,
"overlap": "[]",
"snaps": [
6,
5
]
}
]
},
"errors": [
"extra_clones"
],
"extra clones": [
7
]
},
{
"name": "obj6",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": []
},
"errors": [
"extra_clones"
],
"extra clones": [
1
]
},
{
"name": "obj7",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 0,
"clones": []
},
"errors": [
"extra_clones"
],
"extra clones": [
1
]
},
{
"name": "obj8",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 0,
"clones": [
{
"snap": 1,
"size": 1032,
"overlap": "[]",
"snaps": [
1
]
}
]
},
"errors": [
"snapset_error"
]
},
{
"name": "obj9",
"nspace": "",
"locator": "",
"snap": "head",
"snapset": {
"seq": 1,
"clones": [
{
"snap": 1,
"size": "????",
"overlap": "[]",
"snaps": [
1
]
}
]
},
"errors": []
}
]
}
EOF
fi
jq "$jqfilter" $dir/json | python3 -c "$sortkeys" > $dir/csjson
multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save1.json
fi
if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
fi
# Collect the OSD pids so a crash can be detected later with kill -0.
pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
pids=""
for pidfile in ${pidfiles}
do
pids+="$(cat $pidfile) "
done
ERRORS=0
# When removing snapshots with a corrupt replica, it crashes.
# See http://tracker.ceph.com/issues/23875
if [ $which = "primary" ];
then
for i in `seq 1 7`
do
rados -p $poolname rmsnap snap$i
done
sleep 5
local -i loop=0
while ceph pg dump pgs | grep -q snaptrim;
do
if ceph pg dump pgs | grep -q snaptrim_error;
then
break
fi
sleep 2
loop+=1
if (( $loop >= 10 )) ; then
ERRORS=$(expr $ERRORS + 1)
break
fi
done
fi
ceph pg dump pgs
for pid in $pids
do
if ! kill -0 $pid
then
echo "OSD Crash occurred"
ERRORS=$(expr $ERRORS + 1)
fi
done
kill_daemons $dir || return 1
# Messages that must appear in the primary OSD's log.
declare -a err_strings
err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj4:7 : missing"
err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] soid .*:::obj3:head : size 3840 != size 768 from auth oi"
err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj5:1 : missing"
err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj5:2 : missing"
err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] soid .*:::obj5:4 : size 4608 != size 512 from auth oi"
err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid .*:::obj5:7 : failed to pick suitable object info"
err_strings[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj1:head : missing"
err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub ${scruberrors} errors"
for err_string in "${err_strings[@]}"
do
if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
then
echo "Missing log message '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done
# Check replica specific messages
declare -a rep_err_strings
osd=$(eval echo \$$which)
rep_err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : osd[.][0-9]* found snap mapper error on pg 1.0 oid 1:461f8b5e:::obj16:7 snaps missing in mapper, should be: {1, 2, 3, 4, 5, 6, 7} ...repaired"
for err_string in "${rep_err_strings[@]}"
do
if ! grep "$err_string" $dir/osd.${osd}.log > /dev/null;
then
echo "Missing log message '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done
if [ $ERRORS != "0" ];
then
echo "TEST FAILED WITH $ERRORS ERRORS"
return 1
fi
echo "TEST PASSED"
return 0
}
TEST_scrub_snaps_replica() {
    # Run the multi-OSD snap scrub scenario with the corruption injected on
    # the replica.  Scrub chunk sizes and stat reporting are pinned through
    # CEPH_ARGS, which is restored before returning so later tests see the
    # original settings.
    local dir=$1
    ORIG_ARGS=$CEPH_ARGS
    CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=20 --osd_shallow_scrub_chunk_min=3 --osd_shallow_scrub_chunk_max=3 --osd_pg_stat_report_interval_max_seconds=1 --osd_pg_stat_report_interval_max_epochs=1"
    local ret=0
    _scrub_snaps_multi $dir replica || ret=$?
    CEPH_ARGS=$ORIG_ARGS
    return $ret
}
TEST_scrub_snaps_primary() {
    # Run the multi-OSD snap scrub scenario with the corruption injected on
    # the primary.  Scrub chunk sizes and stat reporting are pinned through
    # CEPH_ARGS, which is restored before returning so later tests see the
    # original settings.
    local dir=$1
    ORIG_ARGS=$CEPH_ARGS
    CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=20 --osd_shallow_scrub_chunk_min=3 --osd_shallow_scrub_chunk_max=3 --osd_pg_stat_report_interval_max_seconds=1 --osd_pg_stat_report_interval_max_epochs=1"
    local ret=0
    _scrub_snaps_multi $dir primary || ret=$?
    CEPH_ARGS=$ORIG_ARGS
    return $ret
}
# Entry point: the main() harness comes from ceph-helpers.sh and invokes run().
main osd-scrub-snaps "$@"
# Local Variables:
# compile-command: "cd build ; make -j4 && \
# ../qa/run-standalone.sh osd-scrub-snaps.sh"
# End: