Merge pull request #48209 from kamoltat/wip-ksirivad-fix-tracker-57570

osd/OSDMap: Check for uneven weights & != 2 buckets post stretch mode

Reviewed-by: Greg Farnum <gfarnum@redhat.com>
Yuri Weinstein 2023-06-19 13:29:21 -04:00 committed by GitHub
commit 5ae95880bf
5 changed files with 211 additions and 1 deletion

@ -1406,6 +1406,31 @@ other performance issue with the OSDs.
The exact size of the snapshot trim queue is reported by the ``snaptrimq_len``
field of ``ceph pg ls -f json-detail``.

Stretch Mode
------------

INCORRECT_NUM_BUCKETS_STRETCH_MODE
__________________________________

Stretch mode currently supports only two dividing buckets that contain OSDs. This
warning indicates that the number of dividing buckets is not equal to two after
stretch mode has been enabled. You can expect unpredictable failures and MON
assertions until the condition is fixed. We encourage you to fix this by removing
the extra dividing buckets or by bringing the number of dividing buckets back to two.
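
For example, if an extra dividing bucket named ``sham`` was added by mistake (the
name used in the accompanying standalone test), removing it clears the warning::

    ceph osd crush rm sham
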
UNEVEN_WEIGHTS_STRETCH_MODE
___________________________
The two dividing buckets must have equal weights when stretch mode is enabled.
This warning indicates that the two dividing buckets have uneven weights after
stretch mode has been enabled. This is not immediately fatal, but you can expect
Ceph to become confused when trying to process transitions between the dividing
buckets. We encourage you to fix this by making the weights even on both dividing
buckets, that is, by making sure the combined weight of the OSDs in each dividing
bucket is the same.
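
For example, the per-bucket weight totals can be inspected with ``ceph osd crush tree``,
and individual OSDs can be reweighted until the totals match (the value below is the
one used in the accompanying standalone test)::

    ceph osd crush reweight osd.0 0.09000
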
Miscellaneous
-------------

@ -1747,6 +1747,29 @@ function test_wait_for_peered() {
#######################################################################
##
# Wait until the given health condition has disappeared from the
# cluster's health detail. $TIMEOUT default.
#
# @param string to grep for in health detail
# @return 0 if the health condition is gone,
#         1 if it still remains after $TIMEOUT seconds.
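#
# Example (as used by the stretch-mode standalone test):
#   wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1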
#
function wait_for_health_gone() {
local grepstr=$1
local -a delays=($(get_timeout_delays $TIMEOUT .1))
local -i loop=0
while ceph health detail | grep "$grepstr" ; do
if (( $loop >= ${#delays[*]} )) ; then
ceph health detail
return 1
fi
sleep ${delays[$loop]}
loop+=1
done
}
##
# Wait until the cluster has health condition passed as arg
# again for $TIMEOUT seconds.

@ -144,6 +144,5 @@ EOF
sleep 3
teardown $dir || return 1
}
main mon-stretch-fail-recovery "$@"

@ -0,0 +1,145 @@
#!/usr/bin/env bash
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
export BASE_CEPH_ARGS=$CEPH_ARGS
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
$func $dir || return 1
teardown $dir || return 1
done
}
TEST_stretched_cluster_uneven_weight() {
local dir=$1
local OSDS=4
local weight=0.09000
setup $dir || return 1
run_mon $dir a --public-addr $CEPH_MON_A || return 1
wait_for_quorum 300 1 || return 1
run_mon $dir b --public-addr $CEPH_MON_B || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
wait_for_quorum 300 2 || return 1
run_mon $dir c --public-addr $CEPH_MON_C || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
wait_for_quorum 300 3 || return 1
run_mon $dir d --public-addr $CEPH_MON_D || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
wait_for_quorum 300 4 || return 1
run_mon $dir e --public-addr $CEPH_MON_E || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
wait_for_quorum 300 5 || return 1
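# Stretch mode requires the connectivity election strategy; mon.e is barred from
# being the leader so it can later serve as the tiebreaker.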
ceph mon set election_strategy connectivity
ceph mon add disallowed_leader e
run_mgr $dir x || return 1
run_mgr $dir y || return 1
run_mgr $dir z || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
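# Build the stretch CRUSH hierarchy: two zones (iris and pze), two hosts per zone,
# one OSD per host.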
for zone in iris pze
do
ceph osd crush add-bucket $zone zone
ceph osd crush move $zone root=default
done
ceph osd crush add-bucket node-2 host
ceph osd crush add-bucket node-3 host
ceph osd crush add-bucket node-4 host
ceph osd crush add-bucket node-5 host
ceph osd crush move node-2 zone=iris
ceph osd crush move node-3 zone=iris
ceph osd crush move node-4 zone=pze
ceph osd crush move node-5 zone=pze
ceph osd crush move osd.0 host=node-2
ceph osd crush move osd.1 host=node-3
ceph osd crush move osd.2 host=node-4
ceph osd crush move osd.3 host=node-5
ceph mon set_location a zone=iris host=node-2
ceph mon set_location b zone=iris host=node-3
ceph mon set_location c zone=pze host=node-4
ceph mon set_location d zone=pze host=node-5
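# Drop the auto-created local host bucket so only the explicit node-2..node-5
# hierarchy remains.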
hostname=$(hostname -s)
ceph osd crush remove $hostname || return 1
ceph osd getcrushmap > crushmap || return 1
crushtool --decompile crushmap > crushmap.txt || return 1
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
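# Append a stretch rule that places two replicas in each zone (on distinct hosts),
# then restore the "# end crush map" marker.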
cat >> crushmap_modified.txt << EOF
rule stretch_rule {
id 1
type replicated
min_size 1
max_size 10
step take iris
step chooseleaf firstn 2 type host
step emit
step take pze
step chooseleaf firstn 2 type host
step emit
}
# end crush map
EOF
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
ceph osd setcrushmap -i crushmap.bin || return 1
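# Create a size-4 pool on the stretch rule, then enter stretch mode with mon.e as
# the tiebreaker and "zone" as the dividing bucket type.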
local stretched_poolname=stretched_rbdpool
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
ceph osd pool set $stretched_poolname size 4 || return 1
ceph mon set_location e zone=arbiter host=node-1 || return 1
ceph mon enable_stretch_mode e stretch_rule zone || return 1 # Enter stretch mode
# Reweight all OSDs to the same round value so the two dividing buckets start out even.
ceph osd crush reweight osd.0 $weight
ceph osd crush reweight osd.1 $weight
ceph osd crush reweight osd.2 $weight
ceph osd crush reweight osd.3 $weight
# First, we test the warning for the number of dividing buckets != 2
ceph osd crush add-bucket sham zone || return 1
ceph osd crush move sham root=default || return 1
wait_for_health "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
ceph osd crush rm sham # clear the health warn
wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
# Next, we test for uneven weights across buckets
ceph osd crush reweight osd.0 0.07000
wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
ceph osd crush reweight osd.0 $weight # clear the health warn
wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
teardown $dir || return 1
}
main mon-stretched-cluster-uneven-weight "$@"

@ -7201,6 +7201,24 @@ void OSDMap::check_health(CephContext *cct,
ss.str(), 0);
}
}
// INCORRECT_NUM_BUCKETS_STRETCH_MODE & UNEVEN_WEIGHTS_STRETCH_MODE
if (stretch_mode_enabled) {
vector<int> subtrees;
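// the dividing buckets are the CRUSH subtrees of the stretch mode bucket type (e.g. zone)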
crush->get_subtree_of_type(stretch_mode_bucket, &subtrees);
if (subtrees.size() != 2) {
stringstream ss;
ss << "Stretch mode buckets != 2";
checks->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
return;
}
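// the two dividing buckets must carry the same total CRUSH weight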
int weight1 = crush->get_item_weight(subtrees[0]);
int weight2 = crush->get_item_weight(subtrees[1]);
stringstream ss;
if (weight1 != weight2) {
ss << "Stretch mode buckets have different weights!";
checks->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
}
}
}
int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,