mirror of https://github.com/schoebel/mars
test_suite: complete split brain tests
This commit is contained in:
parent
a117a2fa86
commit
d100310abb
|
@ -24,7 +24,7 @@ main_base_directory=/home/fl/mars/mars-git-hub/test_suite
|
|||
|
||||
## hosts the tests are running on. The first host is initially always used as
|
||||
## primary host
|
||||
main_host_list=("istore-test-bs4" "istore-test-bap4")
|
||||
main_host_list=("istore-test-bs7" "istore-test-bap7")
|
||||
|
||||
## file to implement a primitive locking mechanism to avoid concurrent runs
|
||||
## of the test suite on the same host
|
||||
|
|
|
@ -76,3 +76,12 @@ switch2primary_logrotate_split_brain_orig_secondary=0
|
|||
## the test of the new primary
|
||||
## primary resp. secondary
|
||||
switch2primary_logrotate_new_primary=0
|
||||
|
||||
## prefix of the name of the script, which writes some data on
|
||||
## the data devices of both primaries
|
||||
switch2primary_write_script_prefix="/tmp/$main_prefix_scripts-split_brain"
|
||||
|
||||
## time for which the amount of data to sync must be constant
|
||||
## to declare the sync process as inactive
|
||||
switch2primary_time_constant_initial_sync=3
|
||||
|
||||
|
|
|
@ -31,13 +31,11 @@ function switch2primary_run
|
|||
|
||||
lib_wait_for_initial_end_of_sync $primary_host $secondary_host $res \
|
||||
$resource_maxtime_initial_sync \
|
||||
$resource_time_constant_initial_sync \
|
||||
$switch2primary_time_constant_initial_sync \
|
||||
"time_waited"
|
||||
lib_vmsg " ${FUNCNAME[0]}: sync time: $time_waited"
|
||||
|
||||
marsadm_do_cmd $primary_host "primary" "$res" || lib_exit 1
|
||||
|
||||
mount_mount_data_device
|
||||
mount_mount_data_device $primary_host $res
|
||||
resource_clear_data_device $primary_host $res
|
||||
|
||||
lib_rw_start_writing_data_device $primary_host "writer_pid" \
|
||||
|
@ -105,40 +103,79 @@ function switch2primary_run
|
|||
marsadm_do_cmd $secondary_host "secondary" "$res" || lib_exit 1
|
||||
}
|
||||
|
||||
# we assume that the script writer_script is running on the primary_host
|
||||
# the process flow in switch2primary_force is as follows:
|
||||
#
|
||||
# - start writing data dev orig_primary (already done, when we are here)
|
||||
# - logrotate and logdelete on orig_primary (if switch2primary_logrotate_orig_primary == 1)
|
||||
# - stop writing data dev orig_primary (if switch2primary_data_dev_in_use == 0)
|
||||
# - cut network connection (if switch2primary_orig_primary_alive == 0)
|
||||
# - marsadm --force primary on orig_secondary
|
||||
# - logrotate and logdelete on orig_primary (if switch2primary_logrotate_orig_primary == 1 and switch2primary_data_dev_in_use == 1)
|
||||
# - stop writing data dev orig_primary (if switch2primary_data_dev_in_use == 1)
|
||||
#
|
||||
# Now we should have a real split brain and should be able to write on both
|
||||
# data devices. The process flow continues:
|
||||
#
|
||||
# - start writing both data devices
|
||||
# - logrotate and logdelete on orig_primary (if switch2primary_logrotate_split_brain_orig_primary == 1)
|
||||
# - logrotate and logdelete on orig_secondary (if switch2primary_logrotate_split_brain_orig_secondary == 1)
|
||||
# - stop writing both data devices
|
||||
# - recreate network connection (if switch2primary_orig_primary_alive == 0)
|
||||
#
|
||||
# Now we try to resolve the split brain. See switch2primary_correct_split_brain.
|
||||
function switch2primary_force
|
||||
{
|
||||
[ $# -eq 4 ] || lib_exit 1 "wrong number $# of arguments (args = $*)"
|
||||
local primary_host=$1 secondary_host=$2 res=$3 writer_script=$4
|
||||
local write_count time_waited host logfile length_logfile net_throughput
|
||||
# # replace string remote_host with $secondary_host
|
||||
# declare -A impact_cmd
|
||||
# eval impact_cmd=(\
|
||||
# $(for x in ${!net_impact_cmd[@]};do
|
||||
# printf "[$x]='${net_impact_cmd[$x]//remote_host/$primary_host}' ";
|
||||
# done)\
|
||||
# )
|
||||
# lib_vmsg "sleep 10"
|
||||
# sleep 10
|
||||
# net_do_impact_cmd $host "impact_cmd" "off"
|
||||
if [ $switch2primary_logrotate_orig_primary -eq 1 ]; then
|
||||
logrotate_loop $primary_host $res 3 4
|
||||
fi
|
||||
if [ $switch2primary_data_dev_in_use -eq 0 ]; then
|
||||
switch2primary_stop_write_and_umount_data_device $primary_host \
|
||||
$writer_script "write_count"
|
||||
fi
|
||||
if [ $switch2primary_orig_primary_alive -eq 0 ]; then
|
||||
net_do_impact_cmd $secondary_host "on" "remote_host=$primary_host"
|
||||
fi
|
||||
marsadm_do_cmd $secondary_host "--force primary" "$res" || lib_exit 1
|
||||
lib_rw_stop_writing_data_device $primary_host $writer_script "write_count"
|
||||
if [ $switch2primary_logrotate_orig_primary -eq 1 ]; then
|
||||
logrotate_loop $primary_host $res 3 4
|
||||
fi
|
||||
if [ $switch2primary_data_dev_in_use -ne 0 ]; then
|
||||
switch2primary_stop_write_and_umount_data_device $primary_host \
|
||||
$writer_script "write_count"
|
||||
fi
|
||||
lib_vmsg " ${FUNCNAME[0]}: write_count: $write_count"
|
||||
main_error_recovery_functions["lib_rw_stop_scripts"]=
|
||||
lib_wait_until_fetch_stops "switch2primary" $secondary_host $primary_host \
|
||||
$res "logfile" "length_logfile" "time_waited" \
|
||||
0 "net_throughput"
|
||||
lib_vmsg " ${FUNCNAME[0]}: fetch time: $time_waited"
|
||||
|
||||
switch2primary_wait_for_first_own_logfile_on_new_primary $secondary_host \
|
||||
$res
|
||||
for host in $primary_host $secondary_host; do
|
||||
switch2primary_check_write_to_logfiles $host $res
|
||||
done
|
||||
# TODO:rm: lib_wait_until_fetch_stops "switch2primary" $secondary_host \
|
||||
# TODO:rm: $primary_host $res "logfile" \
|
||||
# TODO:rm: "length_logfile" "time_waited" \
|
||||
# TODO:rm: 0 "net_throughput"
|
||||
# TODO:rm: lib_vmsg " ${FUNCNAME[0]}: fetch time: $time_waited"
|
||||
|
||||
# TODO:rm: switch2primary_wait_for_first_own_logfile_on_new_primary \
|
||||
# TODO:rm: $secondary_host $res
|
||||
|
||||
switch2primary_write_both_data_devices $primary_host $secondary_host $res
|
||||
|
||||
if [ $switch2primary_orig_primary_alive -eq 0 ]; then
|
||||
net_do_impact_cmd $secondary_host "off" "remote_host=$primary_host"
|
||||
fi
|
||||
|
||||
switch2primary_correct_split_brain $primary_host $secondary_host $res
|
||||
|
||||
}
|
||||
|
||||
# Stop the writer script on a host and unmount the data device of a resource.
#
# Arguments:
#   $1 - host on which the writer script is stopped and the device unmounted
#   $2 - writer script, passed on to lib_rw_stop_writing_data_device
#   $3 - name of a variable, passed on to lib_rw_stop_writing_data_device
#        (presumably it receives the write count; verify against that helper)
#   $4 - optional: resource whose data device is unmounted. If omitted, the
#        value of $res visible in the caller's scope is used, which is what
#        the three-argument callers rely on.
function switch2primary_stop_write_and_umount_data_device
{
    { [ $# -ge 3 ] && [ $# -le 4 ]; } || \
        lib_exit 1 "wrong number $# of arguments (args = $*)"
    local host=$1 writer_script=$2 varname_write_count=$3
    # The right-hand side is expanded before the local variable is created,
    # so $res still refers to the caller's variable here.
    local res=${4:-$res}
    [ -n "$res" ] || lib_exit 1 "no resource given and res not set by caller"

    lib_rw_stop_writing_data_device "$host" "$writer_script" \
                                    "$varname_write_count"
    # The writer has been stopped: clear the entry kept for lib_rw_stop_scripts
    # in the error recovery table.
    main_error_recovery_functions["lib_rw_stop_scripts"]=
    mount_umount_data_device "$host" "$res"
}
|
||||
|
||||
function switch2primary_wait_for_first_own_logfile_on_new_primary
|
||||
{
|
||||
local host=$1 res=$2
|
||||
|
@ -158,41 +195,105 @@ function switch2primary_wait_for_first_own_logfile_on_new_primary
|
|||
}
|
||||
|
||||
# check whether write access to the data device causes writes to the logfiles
|
||||
function switch2primary_check_write_to_logfiles
|
||||
function switch2primary_write_both_data_devices
|
||||
{
|
||||
local host=$1 res=$2
|
||||
local length_logfile length_logfile_old
|
||||
local dev=$(resource_get_data_device $res)
|
||||
length_logfile_old=$(perftest_get_length_last_logfile $host $res $host)
|
||||
lib_vmsg " length last logfile on $host: $length_logfile_old"
|
||||
lib_remote_idfile $host \
|
||||
"yes | dd oflag=direct bs=4096 count=1 of=$dev" || \
|
||||
lib_exit 1
|
||||
length_logfile=$(perftest_get_length_last_logfile $host $res $host)
|
||||
lib_vmsg " length last logfile on $host: $length_logfile"
|
||||
if [ $length_logfile -eq $length_logfile_old ]; then
|
||||
lib_exit 1 "nothing written to logfiles on $host"
|
||||
local primary_host=$1 secondary_host=$2 res=$3
|
||||
local data_dev=$(resource_get_data_device $res)
|
||||
local script=$switch2primary_write_script_prefix.$$
|
||||
local writer_script writer_pid write_count host
|
||||
local rm_opt="no_rm"
|
||||
declare -A last_logfile_old
|
||||
declare -A length_last_logfile_old
|
||||
# this script will be started
|
||||
echo '#/bin/bash
|
||||
while true; do
|
||||
# filter dd standard messages (records in, records out) from stderr
|
||||
yes xyz | dd oflag=direct bs=4096 count=1000 of='$data_dev' status=noxfer 3>&2 2>&1 >&3 | grep -v records 3>&2 2>&1 >&3
|
||||
sleep 1
|
||||
done' >$script
|
||||
for host in $primary_host $secondary_host; do
|
||||
# both hosts must have at least one logfile
|
||||
last_logfile_old[$host]=$(marsadm_get_last_logfile $host $res $host) \
|
||||
|| lib_exit 1
|
||||
length_last_logfile_old[$host]=$(file_handling_get_file_length \
|
||||
$host ${last_logfile_old[$host]}) \
|
||||
|| lib_exit 1
|
||||
lib_vmsg " last logfile:length on $host: ${last_logfile_old[$host]}:${length_last_logfile_old[$host]}"
|
||||
lib_start_script_remote_bg $host $script "writer_pid" "writer_script" \
|
||||
$rm_opt
|
||||
main_error_recovery_functions["lib_rw_stop_scripts"]+="$host $script "
|
||||
rm_opt="rm"
|
||||
done
|
||||
if [ $switch2primary_logrotate_split_brain_orig_primary -eq 1 ]; then
|
||||
logrotate_loop $primary_host $res 3 2
|
||||
fi
|
||||
if [ $switch2primary_logrotate_split_brain_orig_secondary -eq 1 ]; then
|
||||
logrotate_loop $secondary_host $res 3 2
|
||||
fi
|
||||
for host in $primary_host $secondary_host; do
|
||||
lib_rw_stop_one_script $host $script "write_count"
|
||||
done
|
||||
main_error_recovery_functions["lib_rw_stop_scripts"]=
|
||||
for host in $primary_host $secondary_host; do
|
||||
local last_logfile length_logfile
|
||||
last_logfile=$(marsadm_get_last_logfile $host $res $host) || lib_exit 1
|
||||
length_last_logfile=$(file_handling_get_file_length \
|
||||
$host $last_logfile) || lib_exit 1
|
||||
lib_vmsg " act. last logfile:length on $host: $last_logfile:$length_last_logfile"
|
||||
if [ $last_logfile = ${last_logfile_old[$host]} \
|
||||
-a $length_last_logfile -eq ${length_last_logfile_old[$host]} ]
|
||||
then
|
||||
lib_exit 1 "nothing written to logfiles on $host"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
function switch2primary_correct_split_brain
|
||||
{
|
||||
local host=$1 # the former primary
|
||||
local primary_host=$2 res=$3
|
||||
local orig_primary=$1 orig_secondary=$2 res=$3
|
||||
local new_primary new_secondary
|
||||
local dev=$(resource_get_data_device $res)
|
||||
local time_waited
|
||||
mount_umount_data_device $host $res
|
||||
marsadm_do_cmd $host "secondary" "$res" || lib_exit 1
|
||||
marsadm_do_cmd $primary_host "invalidate" "$res" || lib_exit 1
|
||||
lib_wait_for_initial_end_of_sync $primary_host $host $res \
|
||||
if [ $switch2primary_orig_prim_equal_new_prim -ne 0 ]; then
|
||||
new_primary=$orig_primary
|
||||
new_secondary=$orig_secondary
|
||||
else
|
||||
new_primary=$orig_secondary
|
||||
new_secondary=$orig_primary
|
||||
fi
|
||||
marsadm_do_cmd $new_secondary "secondary" "$res" || lib_exit 1
|
||||
if [ $switch2primary_activate_secondary_hardcore -eq 0 ]; then
|
||||
marsadm_do_cmd $new_secondary "invalidate" "$res" || lib_exit 1
|
||||
else
|
||||
local lv_dev=$(lv_config_get_lv_device $res)
|
||||
marsadm_do_cmd $new_secondary "--force leave-resource" "$res" || \
|
||||
lib_exit 1
|
||||
marsadm_do_cmd $new_secondary "--force join-resource" "$res $lv_dev" \
|
||||
|| lib_exit 1
|
||||
fi
|
||||
lib_wait_for_initial_end_of_sync $new_primary $new_secondary $res \
|
||||
$resource_maxtime_initial_sync \
|
||||
$resource_time_constant_initial_sync \
|
||||
"time_waited"
|
||||
lib_vmsg " write some data to $primary_host:$dev"
|
||||
lib_remote_idfile $primary_host \
|
||||
"yes | dd oflag=direct bs=4096 count=1 of=$dev" || \
|
||||
lib_exit 1
|
||||
lib_vmsg " write some data to $new_primary:$dev"
|
||||
local count=0 maxcount=3
|
||||
while true; do
|
||||
lib_remote_idfile $new_primary \
|
||||
"yes | dd oflag=direct bs=4096 count=1 of=$dev" || \
|
||||
lib_exit 1
|
||||
if [ $switch2primary_logrotate_new_primary -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
marsadm_do_cmd $new_primary "log-rotate" $res || lib_exit 1
|
||||
if [ $(($count % 2 )) -eq 0 ]; then
|
||||
marsadm_do_cmd $new_primary "log-delete" $res || lib_exit 1
|
||||
fi
|
||||
let count+=1
|
||||
if [ $count -eq $maxcount ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
lib_wait_for_secondary_to_become_uptodate_and_cmp_cksums "resource" \
|
||||
$host $primary_host \
|
||||
$new_secondary $new_primary \
|
||||
$res $dev 0
|
||||
}
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2010-2013 Frank Liepold / 1&1 Internet AG
|
||||
#
|
||||
# Email: frank.liepold@1und1.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
#####################################################################
|
||||
switch2primary_logrotate_orig_primary=1
|
||||
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2010-2013 Frank Liepold / 1&1 Internet AG
|
||||
#
|
||||
# Email: frank.liepold@1und1.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
#####################################################################
|
||||
switch2primary_logrotate_split_brain_orig_primary=1
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2010-2013 Frank Liepold / 1&1 Internet AG
|
||||
#
|
||||
# Email: frank.liepold@1und1.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
#####################################################################
|
||||
switch2primary_logrotate_split_brain_orig_secondary=1
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
switch2primary_force=1
|
||||
lib_rw_part_of_device_size_written_per_loop=10
|
||||
logrotate_number_of_rotates_before_delete=2
|
||||
|
||||
run_list="resource_prepare resource_run_first switch2primary_run lib_general_checks_after_every_test"
|
||||
run_list="resource_quick_prepare_first_resource switch2primary_run lib_general_checks_after_every_test"
|
||||
|
|
Loading…
Reference in New Issue