From 0e6bb47cb6c08d53d1cb675af012b7aa24397a1d Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Wed, 3 Feb 2016 22:00:47 +0100 Subject: [PATCH] marsadm: fix edge cases of try_to_avoid_splitbrain() Originally a trivial silly bug (boolean value was wrong), leading to an endless loop when a local versionlink was missing, which can happen only after a primary crash at the wrong moment shortly after a logrotate (not even during ordinary operations), followed by a hard reboot. As documented in mars-manual.pdf, you simply need "modprobe mars" to recover after such a crash reboot. MARS remembers the primary state persistently for you and restores everything _automatically_. Using "marsadm primary" in such a case to switch the current primary to primary again (after an unnecessary "marsadm secondary" which is strongly discouraged by mars-manual.pdf), although the host is / was already in primary state after the reboot, is at least as silly as the mentioned bug. Doing this in an /etc/init.d/ startup script, where it really doesn't belong, is even sillier. The latter is even an OPERATIONAL RISK, because "marsadm secondary" works _globally_ in the whole cluster (as documented in mars-manual.pdf). Such an improper startup script _can_ (potentially) disturb another cluster member which had become primary in the _meantime_ during reboot. Global cluster operations don't belong in startup scripts, because reboots may happen unintentionally at any time. --- userspace/marsadm | 72 ++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/userspace/marsadm b/userspace/marsadm index fc2506e5..a5413e65 100755 --- a/userspace/marsadm +++ b/userspace/marsadm @@ -267,8 +267,10 @@ sub sleep_timeout { return; } if ($timeout <= 0) { - ldie "Timeout reached. You may retry with --timeout=-1 to ensure waiting until progress is possible.\n" if (!defined($continue) || !$continue); - lwarn "Timeout reached. 
Continuing anyway.\n" + if (!defined($continue) || !$continue) { + ldie "Timeout reached.\n"; + } + lwarn "Timeout reached. Continuing anyway.\n"; } my $rest = $timeout; $rest = $sleeptime if $rest > $sleeptime; @@ -1008,6 +1010,8 @@ sub log_purge_res { sub try_to_avoid_splitbrain { my ($cmd, $res, $old_primary) = @_; + my $old_timeout = $timeout; + $timeout = $window * 2 if $timeout < 0; $old_primary = "" if $old_primary eq "(none)"; wait_cluster($cmd, $res, $old_primary); if (!detect_splitbrain($res, 0)) { @@ -1015,11 +1019,15 @@ sub try_to_avoid_splitbrain { lwarn "ATTENTION: that's no good idea.\n"; lwarn "ATTENTION: I will continue to do what you want.\n"; lwarn "ATTENTION: But you are responsible for the consequences.\n"; + $timeout = $old_timeout; return; } # now try to prevent producing a _new_ split brain situation.... my @host_list = glob("$mars/resource-$res/replay-*"); + $timeout = $old_timeout; return if scalar(@host_list) < 2; + $timeout = $window * 2 if $timeout < 0; + my $old_situation = ""; for (;;) { my ($min, $max) = get_minmax_versions($res); my $vers_glob = sprintf("$mars/resource-$res/version-%09d-*", $max); @@ -1039,37 +1047,49 @@ sub try_to_avoid_splitbrain { if ($emergency) { ldie "emergency mode $emergency has been entered locally: handover is not possible. 
Either free some space in $mars/, or use --force to use a potentially outdated version.\n"; } - my $own_vers = sprintf("$mars/resource-$res/version-%09d-$host", $max); - if (!get_link($own_vers, 2)) { - $ok = 0; - } else { - my $primary = _get_designated_primary($res); - # if the old primary is known, we can ignore all other / unrelated hosts - if ($primary && $primary ne $host && $primary ne "(none)") { - my $p_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $primary); - my $h_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $host); - my $p_vers = get_link($p_path, 1); - my $h_vers = get_link($h_path, 1); - if (!$p_vers || !$h_vers || $p_vers ne $h_vers) { - $ok = 0; + my $primary = _get_designated_primary($res); + if ($primary eq "(none)") { + # try to determine the old primary when unique + my $glob_logs = sprintf("$mars/resource-$res/log-%09d-*", $max); + my @candidates = glob($glob_logs); + if (scalar(@candidates) == 1) { + my $log_path = pop @candidates; + if ($log_path =~ m:/log-[0-9]+-(.+)$:) { + $primary = $1; + lprint "Using last primary '$primary' as a substitute.\n"; } - } else { - # old primary is unkown: we have no chance, other than comparing _all_ versions. - my @versions = glob($vers_glob); - my $first = get_link(shift @versions); - while (@versions) { - my $next = get_link(shift @versions); - if ($next ne $first) { - $ok = 0; - } + } + } + # if the old primary is known, we can ignore all other / unrelated hosts + if ($primary && $primary ne $host && $primary ne "(none)") { + my $p_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $primary); + my $h_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $host); + my $p_vers = get_link($p_path, 1); + my $h_vers = get_link($h_path, 1); + if (!$p_vers || !$h_vers || $p_vers ne $h_vers) { + $ok = 0; + } + } else { + # old primary is unknown: we have no chance, other than comparing _all_ versions. 
+ my @versions = glob($vers_glob); + my $first = get_link(shift @versions); + while (@versions) { + my $next = get_link(shift @versions); + if ($next ne $first) { + $ok = 0; } } } last if $ok; - lprint "trying to avoid split brain: logfile update not yet completed.\n"; - view_cmd("replinfo", $res); + lprint "Trying to avoid split brain for $timeout s: logfile update not yet completed.\n"; + my $tpl = get_macro("replinfo"); + my $new_situation = eval_macro($cmd, $res, $tpl, @_); + print $new_situation; + $timeout = $window * 2 if $new_situation ne $old_situation; sleep_timeout(); + $old_situation = $new_situation; } + $timeout = $old_timeout; } sub get_size {