#!/bin/bash
# Copyright 2010-2013 Frank Liepold / 1&1 Internet AG
#
# Email: frank.liepold@1und1.de
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

## this module provides functions to test system crashes in a running
## mars installation

# Entry point of the crash test.
#
# Sequence: start a writer on the primary's data device, wait
# crash_time_from_write_start_to_reboot seconds, set two /proc/sys/mars
# parameters, force a reboot of the primary, wait for both hosts to reach the
# expected marsview states, compare checksums and finally run a second
# write/compare cycle.
#
# Globals read: main_host_list, cluster_mars_dir_lv_name_list,
#   main_host_bootloader_label_list, resource_name_list, crash_* settings,
#   marsview_wait_for_state_time.
# Arguments: none.
# Exits via lib_exit 1 if one of the state checks on the secondary failed.
function crash_run
{
    local primary_host=${main_host_list[0]}
    local secondary_host=${main_host_list[1]}
    local mars_dev=$(lv_config_get_lv_device ${cluster_mars_dir_lv_name_list[$primary_host]})
    local boot_label_name="${main_host_bootloader_label_list[$primary_host]}"
    local res=${resource_name_list[0]}
    local dev=$(lv_config_get_lv_device $res)
    # the quoted names handed to the lib_* helpers below are presumably
    # filled by them by name (verify against the helpers)
    local writer_pid writer_script logfile length_logfile time_waited
    local net_throughput
    local host
    # Number of failed state checks on the secondary. One single name is used
    # for declaring, incrementing and testing; the former code incremented a
    # differently spelled (and therefore global) variable, so the check below
    # never triggered.
    local error_count=0

    mount_mount_data_device $primary_host $res
    resource_clear_data_device $primary_host $res

    lib_rw_start_writing_data_device $primary_host "writer_pid" \
                                     "writer_script" 0 0 $res

    lib_vmsg " sleep $crash_time_from_write_start_to_reboot seconds"
    sleep $crash_time_from_write_start_to_reboot

    marsadm_set_proc_sys_mars_parameter $primary_host \
                                        "logger_completion_semantics" \
                                        $crash_logger_completion_semantics
    marsadm_set_proc_sys_mars_parameter $primary_host \
                                        "aio_sync_mode" \
                                        $crash_aio_sync_mode

    crash_reboot $primary_host $secondary_host $mars_dev $crash_maxtime_reboot \
                 $crash_maxtime_to_become_unreachable \
                 "$boot_label_name"

    lib_linktree_print_linktree $primary_host
    cluster_insert_mars_module $primary_host
    marsview_wait_for_state $primary_host $res "disk" "Uptodate" \
                            $crash_maxtime_state_constant
    lib_linktree_print_linktree $primary_host
    marsview_wait_for_state $primary_host $res "repl" "-SFA-" \
                            $crash_maxtime_state_constant

    lib_wait_until_action_stops "syncstatus" $secondary_host $res \
                                $crash_maxtime_sync \
                                $crash_time_constant_sync "time_waited" 0 \
                                "net_throughput"
    lib_vmsg " ${FUNCNAME[0]}: sync time: $time_waited"

    lib_wait_until_fetch_stops "crash" $secondary_host $primary_host $res \
                               "logfile" "length_logfile" "time_waited" 0 \
                               "net_throughput"
    lib_vmsg " ${FUNCNAME[0]}: fetch time: $time_waited"

    # Failures are only counted here, so that the checksum comparison and the
    # link tree dump below still run before the test is aborted.
    marsview_wait_for_state $secondary_host $res "disk" "Uptodate*" \
                            $marsview_wait_for_state_time \
        || error_count=$((error_count + 1))
    marsview_wait_for_state $secondary_host $res "repl" "-SFA-" \
                            $marsview_wait_for_state_time \
        || error_count=$((error_count + 1))

    lib_rw_compare_checksums $primary_host $secondary_host $res 0 "" ""

    if [ $error_count -gt 0 ]; then
        # the label in this message is kept exactly as it has always been
        # printed
        echo "error_ocurred = $error_count" >&2
        for host in $primary_host $secondary_host; do
            lib_linktree_print_linktree $host
        done
        lib_exit 1
    fi

    crash_write_data_device_and_calculate_checksums $primary_host \
                                                    $secondary_host $res $dev
}

# Runs one write cycle on the primary and verifies that the secondary ends up
# with identical data.
#
# Arguments: $1 primary host, $2 secondary host, $3 resource name,
#            $4 data device (stored in a local, not used any further here).
function crash_write_data_device_and_calculate_checksums
{
    local primary_host=$1 secondary_host=$2 res=$3 dev=$4
    local writer_pid writer_script write_count time_waited net_throughput

    mount_mount_data_device $primary_host $res
    resource_clear_data_device $primary_host $res

    lib_rw_start_writing_data_device $primary_host "writer_pid" \
                                     "writer_script" 0 0 $res
    lib_rw_stop_writing_data_device $primary_host $writer_script "write_count"
    # empty the recovery entry for lib_rw_stop_scripts (the writer has been
    # stopped above)
    main_error_recovery_functions["lib_rw_stop_scripts"]=

    lib_wait_until_action_stops "replay" $secondary_host $res \
                                $crash_maxtime_apply \
                                $crash_time_constant_apply "time_waited" 0 \
                                "net_throughput"
    lib_vmsg " ${FUNCNAME[0]}: apply time: $time_waited"

    marsview_wait_for_state $secondary_host $res "disk" "Uptodate" \
                            $crash_maxtime_state_constant
    marsview_wait_for_state $secondary_host $res "repl" "-SFA-" \
                            $crash_maxtime_state_constant
    mount_umount_data_device $primary_host $res
    lib_rw_compare_checksums $primary_host $secondary_host $res 0 "" ""
}

# Forces a reboot of the primary host, waits until it has gone down and has
# come up again, and remounts the mars directory on it.
#
# Arguments (exactly 6, otherwise lib_exit 1):
#   $1 primary host                  $2 secondary host
#   $3 mars device                   $4 maximum time to become reachable again
#   $5 maximum time to become unreachable
#   $6 boot label (only used if the primary's bootloader is lilo)
# Globals read: crash_print_linktree_during_reboot (must be set),
#   main_host_bootloader_list.
#
# NOTE: the arguments are passed unquoted on purpose throughout this module;
# an empty value then shows up as a wrong argument count in the check below.
function crash_reboot
{
    [ $# -eq 6 ] || lib_exit 1 "wrong number $# of arguments (args = $*)"
    local primary_host=$1 secondary_host=$2 mars_dev=$3 maxtime_to_reboot=$4
    local maxtime_to_become_unreachable=$5
    local boot_label_name="$6"
    local pids_to_kill host

    if [ -z "$crash_print_linktree_during_reboot" ]; then
        lib_exit 1 "variable crash_print_linktree_during_reboot not set"
    fi
    if [ $crash_print_linktree_during_reboot -eq 1 -a -z "$secondary_host" ]
    then
        lib_exit 1 "to print symlink trees secondary_host must be given"
    fi

    if [ "${main_host_bootloader_list[$primary_host]}" = "lilo" ]; then
        install_mars_activate_kernel_to_boot_with_lilo $primary_host \
                                                       "$boot_label_name"
    fi

    main_error_recovery_functions["lib_rw_stop_scripts"]=

    if [ $crash_print_linktree_during_reboot -eq 1 ]; then
        for host in $primary_host $secondary_host; do
            lib_linktree_print_linktree $host
        done
    fi

    # starts the remote reboot command as a background job of this shell,
    # hence $! below refers to that job
    crash_reboot_host $primary_host

    # pstree -lp writes s.th. like
    # init(1)---xterm(345)
    # the sed call reduces this to the list of pids
    pids_to_kill=$(pstree -lp $! | sed 's/[^(][^(]*(\([0-9][0-9]*\))/\1 /g')

    if [ $crash_print_linktree_during_reboot -eq 1 ]; then
        lib_linktree_print_linktree $secondary_host
    fi

    crash_wait_to_become_unreachable $primary_host "$pids_to_kill"

    if [ $crash_print_linktree_during_reboot -eq 1 ]; then
        lib_linktree_print_linktree $secondary_host
    fi

    crash_wait_to_become_reachable $primary_host
    cluster_mount_mars_dir $primary_host $mars_dev

    if [ $crash_print_linktree_during_reboot -eq 1 ]; then
        for host in $primary_host $secondary_host; do
            lib_linktree_print_linktree $host
        done
    fi
}

# Issues "reboot -n -f" on the given host via lib_remote_idfile.
# The command is started in the background; the caller picks up its pid
# from $!.
#
# Arguments: $1 host to reboot.
function crash_reboot_host
{
    local host=$1
    local reboot_cmd="reboot -n -f"

    lib_vmsg " reboot of $host"
    lib_remote_idfile $host "$reboot_cmd" &
}

# Waits until the host answers a ping and a remote test command (date)
# started afterwards has terminated.
#
# Arguments: $1 host.
# Reads maxtime_to_reboot, which is not defined in this function: it is a
# local of crash_reboot and visible here because bash resolves variables
# through the call chain. Reads crash_sleep_between_control_cmds.
# Exits via lib_exit 1 if the waiting budget is used up.
function crash_wait_to_become_reachable
{
    local host=$1
    local ssh_pid waited=0

    while [ $waited -lt $maxtime_to_reboot ]; do
        if [ -z "$ssh_pid" ]; then
            # phase 1: no remote command started yet, wait for ping
            if ping -c1 -W10 $host; then
                lib_vmsg " trying a ssh command on $host"
                lib_remote_idfile $host date &
                ssh_pid=$!
            else
                lib_vmsg " waited $waited for $host to become reachable (ping does not succeed)"
                sleep $crash_sleep_between_control_cmds
            fi
        else
            # phase 2: wait for the background remote command to end
            if ps -fp $ssh_pid; then
                lib_vmsg " waited $waited for $host to become reachable (ssh active(pid=$ssh_pid)"
                sleep $crash_sleep_between_control_cmds
            else
                break
            fi
        fi
        let waited+=$crash_sleep_between_control_cmds
    done

    if [ $waited -ge $maxtime_to_reboot ]; then
        lib_exit 1 " duration $maxtime_to_reboot to become reachable exceeded"
    fi
}

# Waits until the host no longer answers a ping, then removes the given
# local processes (SIGHUP first, SIGKILL if still present one second later).
#
# Arguments: $1 host, $2 whitespace separated list of pids to kill.
# Reads maxtime_to_become_unreachable, which is a local of crash_reboot and
# visible here through the call chain, and crash_sleep_between_control_cmds.
# Exits via lib_exit 1 if the host was still reachable when the waiting
# budget was used up; the pids are killed before that check.
function crash_wait_to_become_unreachable
{
    local host=$1 pids_to_kill="$2" pid
    local waited=0

    while [ $waited -lt $maxtime_to_become_unreachable ]; do
        if ping -c1 -W10 $host; then
            lib_vmsg " waited $waited for $host to become unreachable (ping succeeds)"
            sleep $crash_sleep_between_control_cmds
        else
            break
        fi
        let waited+=$crash_sleep_between_control_cmds
    done

    for pid in $pids_to_kill; do
        if ps -fp $pid; then
            kill -1 $pid
            sleep 1
            if ps -fp $pid; then
                kill -9 $pid
            fi
        fi
    done

    if [ $waited -ge $maxtime_to_become_unreachable ]; then
        lib_exit 1 " duration $maxtime_to_become_unreachable to become unreachable exceeded"
    fi
}