#!/bin/bash
#
# This file is part of MARS project: http://schoebel.github.io/mars/
#
# Copyright (C) 2015 Thomas Schoebel-Theuer
# Copyright (C) 2015 1&1 Internet AG
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

############################################################

# TST spring 2015 lab prototype for mass rollout of MARS

# Environment-specific actions are encoded into variables.
# Change them (e.g. in /etc/mars/rollout.conf) for adaptation to
# any other operating environment.
#
# A few conventions are firmly built in: resource names and LVM disk names
# must be equal. In addition, it is advisable that VM names and
# resource names are also strongly related (VM names
# may carry suffixes, like infong4711.schlund.de).
#
# Please feel free to adapt this to your needs.

# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail

# Names of all shell variables which are already set before the defaults
# below are established. do_help uses this list to hide them.
orig_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1)"

# Defaults for configuration variables
# Each default only applies when the variable is unset or empty, so a
# value from the environment wins over the default.
default_config=${default_config:-/etc/mars/rollout.conf}
# The rest is hardcoded here in case the config file does not exist
dry_run=${dry_run:-0}
verbose=${verbose:-0}
confirm=${confirm:-1}
help=${help:-0}
# List of phases to run; the brace expression is expanded via eval in main.
# NOTE(review): unlike the other variables, a value of "phase" from the
# environment is overwritten here - confirm whether this is intended.
phase="{0..8}"
use_fake_sync=${use_fake_sync:-1}
override_fake_sync=${override_fake_sync:-0}
fakeable_resources="${fakeable_resources:-}"
# Options for every ssh call; expanded unquoted (word-split) in remote.
sshopt="${sshopt:--4 -A -T -o StrictHostKeyChecking=no -o ForwardX11=no -o KbdInteractiveAuthentication=no -o VerifyHostKeyDNS=no}"
primary="${primary:-}"
secondary="${secondary:-}"
devices="${devices:-}"
# In the defaults below, "\}" is a literal closing brace which must not
# terminate the surrounding ${...:-...} expansion, and "\$" keeps a dollar
# sign literal so that it is evaluated later (e.g. on the remote host).
device_pattern="${device_pattern:-/dev/vg*/{infong,ovz\}*}"
# grep pattern; matching device names are removed from the device list.
device_remove_regex="${device_remove_regex:-.-md\|old\|-bak}"
# do_phase appends "-i <pv_count> <vg_name>" to this command.
lvcreate_cmd="${lvcreate_cmd:-lvcreate -I 4M -L512G -n mars}"
drbd_force_unload="${drbd_force_unload:-0}"
drbd_dstate_cmd="${drbd_dstate_cmd:-drbdadm dstate}"
# Resources whose dstate output matches this grep pattern count as in sync.
drbd_dstate_pattern="${drbd_dstate_pattern:-UpToDate/UpToDate}"
drbd_get_resources="${drbd_get_resources:-configure_InfongSpace.pl --list all | awk '{ print \$1; }' | sort -u}"
drbd_down_cmd="${drbd_down_cmd:-drbdadm down all || echo IGNORING failed DRBD shutdown because the kernel module will be unloaded anyway}"
# The literal text $res is a placeholder; do_phase substitutes the resource name.
drbd_update_config_res="${drbd_update_config_res:-configure_InfongSpace.pl --update-infong \$res repltype=mars}"
drbd_update_config_global="${drbd_update_config_global:-configure_InfongSpace.pl --write-drbd-conf}"
drbd_stop_cmd="${drbd_stop_cmd:-/etc/init.d/drbd stop || { ! [[ -e /proc/drbd ]] && echo stopping DRBD by hand && rmmod drbd; \}}"
mars_start_cmd="${mars_start_cmd:-ui-config-modify -c MARS_ENABLED=true; /etc/init.d/mars start}"
vm_reinit_cmd="${vm_reinit_cmd:-/etc/init.d/clustermanager restart; sleep 20; cm3 --stop all; sleep 5; cm3 -us}"
vm_status_cmd="${vm_status_cmd:-cm3 -us}"
# Must exit with status 0 when at least one VM is NOT stopped
# (see are_all_vms_stopped, which interprets the status that way).
vm_stopped_all_cmd="${vm_stopped_all_cmd:-cm3 --list-vms --with-status | grep -v '^\$' | grep -vi stopped | grep '.'}"
vm_stop_cmd="${vm_stop_cmd:-cm3 --stop all || { sleep 10; /etc/init.d/clustermanager restart && sleep 20 && cm3 --stop all; \}}"
vm_start_cmd="${vm_start_cmd:-/etc/init.d/clustermanager restart; sleep 20; cm3 --stop all; /etc/init.d/clustermanager restart; sleep 20; cm3 --stop all; cm3 -us; cm3 --start all; sleep 10; cm3 -us; for dummy in {0..2\}; do count=0; for i in \$(cm3 --list-vms --with-status | grep -i broken | cut -d: -f1); do echo \"RESTARTING BROKEN \$i\"; (( count++ )); cm3 -us; sleep 10; cm3 --stop \$i; done; if (( count )); then sleep 10; cm3 --start all; sleep 10; fi; done}"
# END configuration variables

# Names of all variables set at this point. Those which are not in
# orig_vars are listed by do_help as parameter variables.
param_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1)"

# Print a failure message on stderr and terminate the current shell.
#
# Arguments:   $1 - message text (optional; a generic text is used otherwise)
# Exit status: 255 (the value the former "exit -1" was mapped to by bash)
#
# When called inside $(...) or another subshell, only that subshell is
# terminated; the caller has to check the resulting status itself.
function fail
{
    local txt="${1:-Unknown failure}"
    # Write to the already open stderr instead of re-opening /dev/stderr.
    echo "FAILURE: $txt" >&2
    exit 255
}

# Ask the operator for confirmation, unless confirmations are disabled.
#
# Arguments: $1 - when non-empty, the prompt also offers "S to skip"
# Globals:   confirm (read) - 0 disables the prompt
# Returns:   1 when the answer starts with "s" or "S", otherwise 0
function do_confirm
{
    local skip_hint="$1"
    local answer

    if (( !confirm )); then
	return 0
    fi

    if [[ -n "$skip_hint" ]]; then
	skip_hint="S to skip, "
    fi
    echo -n "[CONFIRM: Press ${skip_hint}Return to continue, ^C to abort] "
    read answer
    if [[ "$answer" =~ ^[sS] ]]; then
	return 1
    fi
    return 0
}

# Execute a command as root on a remote host via ssh.
#
# Arguments: $1 - host name
#            $2 - command line for the remote shell; an empty command
#                 (or a single blank) is silently skipped
#            $3 - nofail flag (optional, default 0): when non-zero, a
#                 failing command is reported via the return status
#                 instead of terminating via fail
# Globals:   sshopt, verbose (read)
# Returns:   0 on success; with nofail the exit status of ssh
function remote
{
    local host="$1"
    local cmd="$2"
    local nofail="${3:-0}"
    local rc

    (( verbose > 0 )) && echo "Executing on $host: '$cmd'" >&2
    [[ "${cmd## }" = "" ]] && return 0

    # $sshopt is deliberately unquoted: it carries several ssh options.
    ssh $sshopt "root@$host" "$cmd"
    # Save the status right away: any later test would overwrite $?.
    rc=$?

    if (( rc == 0 )); then
	return 0
    elif (( nofail )); then
	return $rc
    else
	fail "ssh to '$host' command '$cmd' failed with status $rc"
    fi
}

# Run a state-changing command on a remote host, honoring dry-run and
# confirmation mode.
#
# Arguments: $1 - host name
#            $2 - command line to execute remotely
# Globals:   dry_run, confirm (read)
# In dry-run mode the command is only printed. In confirmation mode it is
# printed and the operator may skip it; otherwise it is executed directly.
function remote_action
{
    local target="$1"
    local action="$2"

    if (( dry_run )); then
	echo "DRY_RUN REMOTE $target ACTION '$action'"
	return
    fi

    if (( confirm )); then
	echo "REMOTE $target ACTION '$action'"
	if ! do_confirm 1; then
	    echo "SKIPPING $target ACTION '$action'"
	    return
	fi
    fi

    remote "$target" "$action"
}

# Print the resource name belonging to a device path, i.e. everything
# after the last slash (e.g. /dev/vg00/infong4711 -> infong4711).
#
# Arguments: $1 - device path (mandatory)
# Outputs:   the resource name on stdout
# A missing or empty argument terminates the current (sub)shell via fail.
function _get_resource
{
    local device="${1:-}"

    # Test explicitly: a fail hidden in a "local x=$(...)" declaration
    # would neither stop this function nor produce a failure status.
    if [[ "$device" = "" ]]; then
	fail "Resource argument is missing"
    fi
    printf '%s\n' "${device##*/}"
}

# Check whether all VMs on a host are stopped.
#
# Arguments: $1 - host name
# Globals:   vm_stopped_all_cmd (read) - executed remotely; its exit
#            status 0 is interpreted as "some VM is not stopped"
# Outputs:   a one-line summary on stderr
# Returns:   0 when all VMs are stopped, 1 otherwise
function are_all_vms_stopped
{
    local host="$1"
    local ret
    local negation=""

    # The output of the check command is diverted to stderr, so that only
    # its exit status arrives on stdout.
    ret=$(remote $host "{ $vm_stopped_all_cmd ; } 1>&2 ; echo \$?")
    if (( !ret )); then
	negation="NOT "
    fi
    echo "VMs on $host are ${negation}stopped" >> /dev/stderr
    return $(( !ret ))
}

# Source a file into the running shell when it is readable.
#
# Arguments: $1 - path of the file
#            $2 - kind of file, only used in the failure message
# An unreadable or missing file is silently ignored; a file which cannot
# be sourced successfully is reported via fail.
function source_when_possible
{
    # The sourced file runs inside this function and can therefore see
    # these two local names.
    local file="$1"
    local type="$2"

    [[ -r "$file" ]] || return 0
    source "$file" || fail "$type file $file is not parsable"
}

source_when_possible "$default_config" "config"

# Parse the command line and store the result in global variables.
#
# Arguments: the command line words of the script
#   --name        sets variable "name" to 1
#   --name=value  sets variable "name" to the literal value (no further
#                 shell evaluation takes place)
#   -h            requests the help text
#   -v            increments the verbosity
#   anything else is taken as primary, then as secondary host name
# Dashes inside of variable names are translated to underscores.
# Only variables which are already set (possibly to an empty value) may be
# overridden; anything else is rejected via fail, as is a third host name.
function parse_cmdline
{
    local _pc_arg
    local _pc_name
    local _pc_value

    for _pc_arg; do
	if [[ "$_pc_arg" =~ ^--[-_A-Za-z0-9]+$ ]]; then
	    _pc_name="${_pc_arg#--}"
	    _pc_value=1
	elif [[ "$_pc_arg" =~ ^--[-_A-Za-z0-9]+= ]]; then
	    _pc_name="${_pc_arg#--}"
	    _pc_value="${_pc_name#*=}"
	    _pc_name="${_pc_name%%=*}"
	elif [[ "$_pc_arg" = "-h" ]]; then
	    help=1
	    continue
	elif [[ "$_pc_arg" = "-v" ]]; then
	    (( verbose++ ))
	    continue
	elif [[ "$primary" = "" ]]; then
	    primary="$_pc_arg"
	    continue
	elif [[ "$secondary" = "" ]]; then
	    secondary="$_pc_arg"
	    continue
	else
	    fail "bad parameter syntax '$_pc_arg'"
	fi

	# Only reached for --name and --name=value
	_pc_name="${_pc_name//-/_}"
	# Indirect expansion needs a syntactically valid variable name
	[[ "$_pc_name" =~ ^[_A-Za-z][_A-Za-z0-9]*$ ]] || fail "Variable '$_pc_name' is unknown"
	# "Known" means set, even when the value is empty (e.g. devices)
	[[ -n "${!_pc_name+set}" ]] || fail "Variable '$_pc_name' is unknown"
	# Assign literally: the value is never word-split or executed
	printf -v "$_pc_name" '%s' "$_pc_value"
    done
    return 0
}

# Allow forceful override of any _known_ variable at the command line
parse_cmdline "$@"

# Print the usage text followed by the list of parameter variables and
# their current values.
#
# Globals: orig_vars, param_vars (read) - every name in param_vars which
#          is not in orig_vars and does not end in "_vars" is listed
# Outputs: help text on stdout; purely numeric values are printed bare,
#          all other values in double quotes
function do_help
{
    local listing

    # Values are fetched by indirect expansion so that they are shown
    # verbatim (no word splitting, no pathname expansion).
    listing="$(
	declare -A orig
	for i in $orig_vars; do
	    orig[$i]=1
	done
	for i in $param_vars; do
	    [[ "$i" =~ _vars$ ]] && continue
	    [[ -n "${orig[$i]:-}" ]] && continue
	    if [[ "${!i}" =~ ^[0-9]+$ ]]; then
		echo "$i=${!i}"
	    else
		echo "$i=\"${!i}\""
	    fi
	done
    )"

    cat <<EOF
usage: $0 [options] <primaryhost> <secondaryhost>

The following parameter variables can be either passed by the
environment, or used for hard overriding on the command line
via --variable=value syntax:

$listing
EOF
}

# An explicit help request prints the usage and ends successfully.
if (( help )); then
    do_help
    exit 0
fi

# Both host names are mandatory. The usage is shown before the failure.
if [[ -z "$primary" ]]; then
    do_help
    fail "No primary hostname given"
fi
if [[ -z "$secondary" ]]; then
    do_help
    fail "No secondary hostname given"
fi
# Primary and secondary must be two different hosts.
if [[ "$primary" = "$secondary" ]]; then
    fail "Primary and secondary hostnames must be distinct"
fi

# Execute a single phase of the rollout.
#
# Arguments: $1 - phase number
#   0  create and mount the /mars filesystem where it is missing
#   1  create the MARS cluster on the primary, join it on the secondary
#   2  stop running VMs
#   3  check the DRBD sync state, back up the DRBD config, shut DRBD down
#   4  start MARS, create / join the resources, optionally fake-sync
#   5  show the MARS status
#   6  reinit the VM cluster manager
#   7  start the VMs on the primary
#   8  show the VM status
#   any other value only prints a message
# Globals read:    primary, secondary, the configured *_cmd and drbd_*
#                  variables, devices_<host>, sizes[] (declared in main and
#                  visible here because main is the caller)
# Globals written: downtime_start, downtime_end, uptime_start, final,
#                  fakeable_resources, use_fake_sync
# Remote checks follow one pattern: the remote command prints its own exit
# status ("echo \$?"), which is then evaluated arithmetically, i.e.
# non-zero output means that the tested condition does NOT hold.
function do_phase
{
    local phase="$1"
    local host

    echo ""
    echo "------- Phase $phase"
    echo ""

    case "$phase" in
	0)
	echo "Create the /mars filesystem when necessary, ensure that it is mounted"
	for host in $primary $secondary; do
	    # No LV named "mars" exists in any VG: create one
	    if (( $(remote $host "ls /dev/*/mars 1>&2; echo \$?") )); then
		# Pick the first VG after a reverse numeric sort on colon field 16
		# (presumably the VG with the most free extents - TODO confirm
		# against the vgdisplay -c column layout).
		# Declaration and assignment are separate, because "local"
		# would hide a failure of the command substitution.
		local line
		line="$(remote $host "vgdisplay -c | sort -n -t: -k16 -r | head -1")" || fail "Cannot determine VG"
		local vg_name="$(echo "$line" | cut -d: -f1)"
		[[ "${vg_name// /}" = "" ]] && fail "Invalid VG name '$vg_name'"
		local pv_count="$(echo "$line" | cut -d: -f10)"
		(( pv_count < 1 )) && fail "Invalid PV count '$pv_count'"
		echo "Host $host VG '$vg_name' (has $pv_count physical volumes)"
		remote_action $host "$lvcreate_cmd -i $pv_count $vg_name"
		sleep 2
		if (( $(remote $host "ls /dev/*/mars 1>&2; echo \$?") )); then
		    fail "No LV for /mars exists on $host"
		fi
	    fi
	    # /mars is not mounted: mount it, creating the filesystem when the
	    # plain mount does not succeed
	    if (( $(remote $host "grep -q /mars /proc/mounts; echo \$?") )); then
		remote_action $host "[[ -d /mars ]] || mkdir /mars; mount /mars || { mkfs.ext4 -L mars /dev/*/mars && mount /dev/*/mars /mars; }"
		if (( $(remote $host "grep -q /mars /proc/mounts; echo \$?") )); then
		    fail "No /mars is mounted on $host"
		fi
	    fi
	done
	;;

	1)
	echo "Create/join the MARS cluster when necessary"
	# A missing /mars/uuid is taken as "not yet member of a cluster"
	if (( $(remote $primary "ls -l /mars/uuid 1>&2; echo \$?") )); then
		echo "Host $primary create-cluster"
		remote_action $primary "marsadm create-cluster"
	fi
	if (( $(remote $secondary "ls -l /mars/uuid 1>&2; echo \$?") )); then
		echo "Host $secondary join-cluster"
		remote_action $secondary "marsadm join-cluster $primary"
	fi
	;;

	2)
	echo "Stop VMs when necessary"
	for host in $primary $secondary; do
	    if are_all_vms_stopped $host; then
		echo "No VMs are running on host $host."
	    else
		echo "Some VMs are running on host $host"
		# The downtime starts with the first stop operation
		(( !downtime_start )) && downtime_start=$(date +%s)
		remote_action $host "$vm_stop_cmd"
		downtime_end=$(date +%s)
		echo "ESTIMATED operation duration: $(( downtime_end - downtime_start )) seconds"
		if ! are_all_vms_stopped $host; then
		    fail "Some VMs are running on host $host"
		fi
	    fi
	done
	if (( downtime_start )); then
	    echo "ESTIMATED total shutdown operation duration: $(( downtime_end - downtime_start )) seconds"
	fi
	;;

	3)
	echo "Stop DRBD when necessary"
	# Enter when forced, or when /proc/drbd exists on the primary
	if (( drbd_force_unload || !$(remote $primary "[[ -e /proc/drbd ]]; echo \$?") )); then
	    # Separate declaration, otherwise "|| fail" could never trigger
	    local drbd_res
	    drbd_res="$(remote $primary "$drbd_get_resources")" || fail "Cannot get DRBD resources on $primary"
	    echo "DRBD resources on host $primary: $(echo $drbd_res)"
	    local cmd="for i in $(echo $drbd_res); do echo -n \"\$i \"; $drbd_dstate_cmd \$i; done"
	    echo "DRBD dstate on host $primary:"
	    # One "<resource> <dstate>" line per resource is collected here
	    local tmpfile
	    tmpfile="$(mktemp "/tmp/dstate.$primary.XXXXXX")" || fail "Cannot create temporary file"
	    remote $primary "$cmd" | tee "$tmpfile"
	    if grep -qv "$drbd_dstate_pattern" < "$tmpfile"; then
		echo "DRBD on $primary is NOT in sync"
	    else
		echo "DRBD on $primary is in sync"
	    fi
	    if (( use_fake_sync )); then
		echo "The following resources are fakeable:"
		local res
		local txt
		while read res txt; do
		    # When nothing matches, the here-document still consists
		    # of one empty line, which must not count as a resource.
		    [[ "$res" = "" ]] && continue
		    echo "$res $txt"
		    fakeable_resources+=" $res"
		done <<EOF
$(grep "$drbd_dstate_pattern" < "$tmpfile")
EOF
		echo "List of fakeable DRBD resources: $fakeable_resources"
	    fi
	    rm -f "$tmpfile"
	    for host in $primary $secondary; do
		echo "Creating DRBD backup on $host"
		remote_action $host "tar czvf /var/backups/drbd-config-$(date +%Y%m%d-%H%M).tgz /etc/drbd* || true"
		echo "Shutdown DRBD on $host"
		remote_action $host "$drbd_down_cmd"
		local res
		local cmd=""
		# Build one command line: per-resource config update (the
		# literal \$res placeholder is replaced), global update, stop
		for res in $drbd_res; do
		    cmd+="${drbd_update_config_res/\$res/$res} ; "
		done
		cmd+="$drbd_update_config_global ; $drbd_stop_cmd"
		remote_action $host "$cmd"
	    done
	else
	    echo "DRBD is NOT in use, switching off fake-sync"
	    use_fake_sync=0
	fi
	;;

	4)
	echo "Start MARS when necessary"
	for host in $primary $secondary; do
	    # A missing /proc/sys/mars is taken as "MARS is not running"
	    if (( $(remote $host "[[ -d /proc/sys/mars ]]; echo \$?") )); then
		remote_action $host "$mars_start_cmd"
		sleep 3 &
	    else
		echo "MARS is already running on $host"
	    fi
	done
	# Wait for the background sleeps started above
	wait

	for host in $primary $secondary; do
	    local device
	    local cmd=""
	    # devices_<host> was filled by main (dashes in the host name are
	    # replaced by underscores)
	    for device in $(eval "echo \${devices_${host//-/_}}"); do
		local res="$(_get_resource $device)"
		[[ "$res" = "" ]] && fail "Implausible resource name '$res'"
		local this_size=${sizes[$res]}
		(( this_size < 4096 )) && fail "Implausible device size '$this_size'"
		# Only resources without /mars/resource-<res>/data-<host> are set up
		if (( $(remote $host "[[ -e /mars/resource-$res/data-$host ]]; echo \$?") )); then
		    echo "RESOURCE $res on $host: device $device size $this_size"
		    if [[ "$host" = "$primary" ]]; then
			cmd+="marsadm create-resource $res $device $res $this_size && "
		    else
			[[ "$cmd" = "" ]] && cmd="marsadm wait-cluster ; "
			cmd+="marsadm join-resource $res $device && "
		    fi
		else
		    echo "RESOURCE $res already exists on $host"
		fi
	    done
	    if [[ "$cmd" != "" ]]; then
		# The trailing "true" completes the final "&&" of the chain
		remote_action $host "$cmd true"
	    fi
	done
	if (( use_fake_sync )) && [[ "$fakeable_resources" != "" ]]; then
	    echo "Starting FAKE-SYNC on resources $fakeable_resources"
	    remote_action $secondary "for i in $fakeable_resources; do marsadm fake-sync \$i; done"
	elif (( override_fake_sync )); then
	    echo "OVERRIDING FAKE-SYNC on ALL resources"
	    remote_action $secondary "marsadm fake-sync all"
	else
	    echo "no fake-sync is executed"
	fi
	;;

	5)
	echo "Show status of MARS"
	for host in $primary $secondary; do
	    echo ""
	    echo "MARS Status on $host:"
	    remote $host "marsadm view all"
	done
	;;

	6)
	echo "Reinit VM clustermanager"
	for host in $primary $secondary; do
	    echo "------ Reinit $host:"
	    remote_action $host "$vm_reinit_cmd"
	done
	;;

	7)
	echo "Start VMs when necessary"
	if are_all_vms_stopped $primary; then
	    uptime_start=$(date +%s)
	    remote_action $primary "$vm_start_cmd"
	    final=$(date +%s)
	    echo "ESTIMATED startup duration: $(( final - uptime_start )) seconds"
	    # downtime_start is only non-zero when phase 2 had to stop VMs
	    if (( downtime_start )); then
		echo "ESTIMATED total VM downtime: $(( final - downtime_start )) seconds"
	    fi
	    echo ""
	else
	    echo "Some VMs are running on host $primary. Please check by hand whether some of them need a restart."
	fi
	;;

	8)
	echo "Show status of VMs"
	for host in $primary $secondary; do
	    echo "------ Status on $host:"
	    remote $host "$vm_status_cmd"
	done
	;;

	*)
	echo "Unknown / unimplemented phase '$phase'"
	;;
    esac
}

# Run the complete rollout.
#
# Arguments: the command line words of the script (only used for display)
# Globals read:    primary, secondary, devices, device_pattern,
#                  device_remove_regex, phase
# Globals written: devices_<host> and resources_<host> for both hosts
#                  (dashes in host names are replaced by underscores)
# Steps: check reachability of both hosts, determine the devices and the
# derived resource names, determine per-resource sizes, ask for
# confirmation, then execute all phases listed in $phase.
function main
{
    local host device sector_size this_size this_resource res this_phase
    local script_start script_end

    echo "Script $0 running phase $phase"
    echo ""
    echo "Params: $0 $*"
    echo ""
    echo "primary:   '$primary'"
    echo "secondary: '$secondary'"
    echo ""

    script_start=$(date +%s)

    for host in $primary $secondary; do
	# Report the host which has actually been tested
	ping -c 1 $host || fail "Host '$host' is not reachable"
	remote $host uptime || fail "ssh connection to '$host' does not work. Ensure that ssh-agent is running."
    done
    echo ""

# when necessary, determine list of devices

    if [[ "$devices" = "" ]]; then
	for host in $primary $secondary; do
	    # List the devices remotely and drop those matching device_remove_regex
	    eval "devices_${host//-/_}=\"$(remote $host "ls $device_pattern" 2>/dev/null | grep -v "$device_remove_regex")\"" || fail "cannot determine devices on $host"
	    eval "echo devices_${host//-/_}: \${devices_${host//-/_}}"
	done
    else
	for host in $primary $secondary; do
	    eval "devices_${host//-/_}=\"$devices\""
	done
	echo "Using given devices '$devices' for both hosts $primary $secondary"
    fi

    for host in $primary $secondary; do
	[[ "$(eval "echo \${devices_${host//-/_}}")" = "" ]] && fail "No devices have been determined on $host"
	# The sorted resource names are derived from the device names
	eval "resources_${host//-/_}=\"\$(for i in \${devices_${host//-/_}}; do _get_resource "\$i"; done | sort)\""
	eval "echo resources_${host//-/_}: \${resources_${host//-/_}}"
	[[ "$(eval "echo \${resources_${host//-/_}}")" = "" ]] && fail "No resources have been determined on $host"
    done
    if [[ "$(eval "echo \${resources_${primary//-/_}}")" != "$(eval "echo \${resources_${secondary//-/_}}")" ]]; then
	fail "Primary resource list is different from secondary resource list"
    fi

    # Local to main, but visible in do_phase since it is called from here
    declare -A sizes

    for host in $primary $secondary; do
	echo "Host $host:"
	# Field 7 of lvdisplay -c is multiplied by 512 to obtain the size.
	# Per resource, the smaller value of both hosts is kept.
	while read device sector_size; do
	    this_size=$(( sector_size * 512 ))
	    echo "  device $device: size $this_size"
	    this_resource="$(_get_resource $device)"
	    if (( !sizes[$this_resource] || this_size < sizes[$this_resource] )); then
		sizes[$this_resource]=$this_size
	    fi
	done <<EOF
$(remote $host "/sbin/lvdisplay -c $(eval "echo \${devices_${host//-/_}}") | cut -d: -f1,7" | sed 's/:/ /')
EOF
    done

    echo ""
    echo "Determined the following sizes:"
    for res in ${!sizes[*]}; do
	echo "  $res: ${sizes[$res]}"
    done
    echo ""

    # Last chance to abort via ^C; the return value is not evaluated
    do_confirm

    # $phase may be a brace expression like {0..8}, hence the eval
    for this_phase in $(eval "echo $phase"); do
	do_phase $this_phase
    done

    script_end=$(date +%s)
    echo "ESTIMATED script duration: $(( script_end - script_start )) seconds"
}

# Start timestamps in seconds since the epoch (taken via date +%s in
# do_phase); 0 means "not recorded".
downtime_start=0
uptime_start=0

# Run the rollout; all output is shown and additionally kept in a log file
# in the current directory. The script arguments are handed over to main
# so that its "Params:" line shows them.
main "$@" 2>&1 | tee "rollout-$(date +%Y%m%d-%H%M).$primary.$secondary.log"