mars/userspace/marsadm

2069 lines
63 KiB
Perl
Executable File

#!/usr/bin/perl -w
# (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
use strict;
use English;
use warnings;
umask 0077;
##################################################################
# messaging
my $error_count = 0;
my $notify = "";
sub lprint {
my ($text) = @_;
print $text;
if ($notify) {
system("/usr/bin/logger -t marsadm \"$notify $text\"");
}
}
sub lprint_stderr {
my ($text) = @_;
print STDERR $text;
if ($notify) {
system("/usr/bin/logger -t marsadm \"$notify $text\"");
}
}
sub ldie {
my ($text) = @_;
$error_count++;
lprint_stderr "DYING: $text";
die "\n";
}
sub lwarn {
my ($text) = @_;
lprint_stderr "WARNING: $text";
}
##################################################################
# low-level infrastructure
my @link_list = ();
my %link_hash;
my $verbose = 0;
my $dry_run = 0;
sub get_link {
my ($path, $unchecked) = @_;
my $result = readlink($path);
if (!defined($result)) {
ldie "cannot read symlink '$path'\n" unless $unchecked;
lwarn "cannot read symlink '$path'\n" if $unchecked == 1;
}
return $result;
}
sub is_link_recent {
my ($path) = @_;
my @stat = lstat($path);
return 0 if (!@stat);
return 1 if $stat[9] + 15 >= mars_time();
return 0;
}
sub to_tmp {
my $path = shift;
$path =~ s:^(.*)/:$1/.tmp.:;
return $path;
}
sub from_tmp {
my $path = shift;
$path =~ s:^(.*)/\.tmp\.:$1/:;
return $path;
}
sub set_link {
my ($src, $dst) = @_;
my $dst_tmp = to_tmp($dst);
unlink($dst_tmp);
symlink($src, $dst_tmp) or ldie "cannot create symlink '$dst' -> '$src'\n";
# the _order_ is important! remove existing intermediate element before re-appanding
if (exists($link_hash{$dst})) {
my @copy = @link_list;
@link_list = ();
foreach my $elem (@copy) {
next if $elem eq $dst;
push @link_list, $elem;
}
}
$link_hash{$dst} = $src;
push @link_list, $dst;
}
sub finish_links {
return unless @link_list;
my $timestamp = mars_time();
lprint "using lamport timestamp $timestamp\n" if $verbose;
while (my $link = shift @link_list) {
my $link_tmp = to_tmp($link);
my $target = readlink($link_tmp);
my $this_timestamp = $timestamp;
# allow overriding of secondaries in partitioned clusters by use of small timestamps
if ($target eq "(none)") {
my @stat = lstat($link);
$this_timestamp = $stat[9] + 1 if @stat;
}
system("touch -h -d \"\@$this_timestamp\" $link_tmp") == 0 or ldie "cannot set mtime on symlink '$link_tmp'\n";
if ($dry_run) {
lprint "DRY_RUN: would create symlink '$link' -> '$target'\n";
unlink($link_tmp);
next;
}
rename($link_tmp, $link) or ldie "cannot finalize symlink '$link'\n";
if ($verbose) {
lprint "created symlink '$link' -> '$target'\n";
}
}
_trigger();
}
##################################################################
# global variables and checks
my $Id = '$Id$ ';
my $user_version = 0.1;
my $mars = "/mars";
my $host = `uname -n` or ldie "cannot determine my network node name\n";
chomp $host;
my $real_host = $host;
my $force = 0;
my $timeout = -1;
my $ip = "";
my $kernel_version = 0;
unless (defined($ARGV[0]) && $ARGV[0] =~ m/cluster|cat/) {
$kernel_version = get_link("$mars/tree-$host", 1);
if ($kernel_version && $user_version != $kernel_version) {
lwarn "kernel_version=$kernel_version user_version=$user_version\n";
if ($user_version < $kernel_version) {
ldie "Sorry, your MARS kernel module uses version $kernel_version, but my $0 userspace version is only $user_version. That cannot work. Please upgrade your userspace scripts!\n";
} elsif (int($user_version) > int($kernel_version) + 1) {
ldie "MAJOR VERSION mismatch : only one major upgrade step is supported.\n";
}
lwarn "using different minor versions is possible, but you should upgrade your kernel module ASAP\n";
}
}
sub get_alive_links {
my $res = shift || "all";
my $alive = shift || "alive";
my $glob = "$mars/$alive-*";
if ($res ne "all") {
$glob = "$mars/$alive-{";
my $count = 0;
foreach my $peer (glob("$mars/resource-$res/data-*")) {
$peer =~ m:/data-(.+):;
$glob .= "," if $count++;
$glob .= $1;
}
$glob .= "}";
}
my %links;
foreach my $path (glob($glob)) {
$path =~ m:/$alive-(.+):;
my $peer = $1;
$links{$peer} = get_link($path);
}
return %links;
}
##################################################################
# timeout handling
#
# return the lamport clock time in nanosecond resolution
# fallback to system time()
#
sub mars_time {
my $lamport_time;
if (open(my $lamport_clock, "<", "/proc/sys/mars/lamport_clock")) {
while (<$lamport_clock>) {
$lamport_time = $1 if /^lamport_now=(.*)/;
}
close($lamport_clock);
}
return $lamport_time || time() . "." . '0' x 9;
}
sub sleep_timeout {
my $sleeptime = shift || 5;
if ($timeout < 0) {
sleep($sleeptime);
return;
}
ldie "Timeout reached. You may retry with --timeout=-1 to ensure waiting until progress is possible.\n" if !$timeout;
my $rest = $timeout;
$rest = $sleeptime if $rest > $sleeptime;
sleep($rest);
$timeout -= $rest;
}
# wait for some condition
sub wait_cond {
my ($cmd, $res, $specific) = @_;
my $is_actual = ($specific =~ s/^(is|has)-//);
my $is_on = !($specific =~ s/-off$//);
$specific =~ s/-on$//;
if ($is_actual) {
if ($specific eq "device") {
check_mars_device($cmd, $res, 1, !$is_on);
return;
}
my %table =
(
"attach" => "is-attached",
"attached" => "is-attached",
"replay" => "is-replaying",
"replaying"=> "is-replaying",
"copy" => "is-copying",
"copying" => "is-copying",
"sync" => "is-syncing",
"syncing" => "is-syncing",
"primary" => "is-primary",
);
my $name = $table{$specific};
ldie "actual indicator '$specific' does not exist\n" unless exists($table{$specific});
check_status($cmd, $res, $name, $is_on ? 1 : 0, 1);
} else {
my %table =
(
"attach" => "attach",
"attached" => "attach",
"connect" => "connect",
"replay" => "allow-replay",
"sync" => "sync",
);
my $name = $table{$specific};
ldie "button '$specific' does not exist\n" unless exists($table{$specific});
check_todo($cmd, $res, $name, $is_on ? 1 : 0, 1);
}
}
# wait until some communication has occurred
sub wait_cluster {
return wait_cond(@_) if int(@_) >= 3;
my $cmd = shift;
my $res = shift || "all";
my $start_time = mars_time();
_trigger();
my $delta = $timeout > 0 ? $timeout : 30;
while (1) {
my $dead_count = 0;
my $alive_count = 0;
my $unknown_count = 0;
my %status = get_alive_links($res, "time");
my $now = mars_time();
foreach my $peer (keys(%status)) {
next if $peer eq $host;
if ($status{$peer} >= $start_time) {
$alive_count++;
} elsif ($status{$peer} + $delta < $now) {
$dead_count++;
} else {
$unknown_count++;
}
}
if (!$dead_count && !$unknown_count) {
lprint "all $alive_count peer(s) seem to be alive\n";
last;
}
if (!$unknown_count) {
lwarn "$alive_count peer(s) seem to be alive, and $dead_count peer(s) seem to be dead / not reachable\n";
ldie "--force not given\n" unless $force;
last;
}
sleep(1);
}
}
##################################################################
# syntactic checks
sub check_id {
my $str = shift;
ldie "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z_][-A-Za-z0-9_]*$/;
ldie "identifier '$str' is too long (only 16 chars allowed)" if length($str) > 16;
}
##################################################################
# semantic checks
sub check_res {
my $res = shift;
if (not -d "$mars/resource-$res") {
# DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique
my $count = 0;
my $found;
my @tests = glob("$mars/resource-*/device-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
if ($target eq $res) {
$found = $test;
$count++;
}
}
if (!$count) {
@tests = glob("$mars/resource-*/_direct-*-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
$target =~ s/^.*,//;
if ($target eq $res) {
$found = $test;
$count++;
}
}
}
ldie "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found;
$found =~ s:^.*/resource-(.*)/.*$:$1:;
lwarn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n";
$res = $found;
}
return $res;
}
sub check_sizes {
my ($res, $host) = @_;
my $logical_size = get_link("$mars/resource-$res/size");
my $physical_size = get_link("$mars/resource-$res/actsize-$host", 1);
if (defined($physical_size) && $physical_size < $logical_size) {
ldie "physical device on host '$host' has size $physical_size, which is smaller than the logical resource size $logical_size\n";
}
}
sub check_res_member {
my $res = shift;
ldie "sorry, I have not yet joined to resource '$res'\n" unless -e "$mars/resource-$res/data-$host";
check_sizes($res, $host);
}
sub check_sync_finished {
my ($res, $host) = @_;
check_sizes(@_);
my $lnk = "$mars/resource-$res/syncstatus-$host";
if (lstat($lnk)) {
my $syncstatus = get_link($lnk, 1);
my $size = get_link("$mars/resource-$res/size");
ldie "sync has not yet finished, only $syncstatus / $size bytes transferred\n" unless $syncstatus >= $size;
}
lprint "OK, it seems that sync has finished on $host.\n";
}
sub check_primary {
my ($cmd, $res) = @_;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk);
if (!$is_primary) { # give it a second chance
my $name = get_link("$mars/resource-$res/device-$host", 1);
my $dev = "/dev/mars/$name";
$is_primary = 1 if -b $dev;
}
ldie "for operation '$cmd' I need to be primary\n" unless $is_primary;
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', I also must be the designated primary\n" unless $primary eq $host;
}
sub check_not_primary {
my ($cmd, $res) = @_;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk, 1);
if ($is_primary) {
ldie "operation '$cmd' cannot be executed on primary\n" unless $force;
lwarn "operation '$cmd' is forced on actual primary '$host', THIS IS RISKY\n";
}
# also check whether we intend to become primary
my $primary = _get_designated_primary($res);
ldie "operation '$cmd' cannot be executed on designated primary\n" if $primary eq $host;
}
sub check_primary_gone {
my ($res) = @_;
for (;;) {
my $pri = _get_actual_primary($res);
last if !$pri;
last if $pri eq $host;
lprint "waiting for other primary host ($pri) to disappear....\n";
sleep_timeout();
}
}
sub check_todo {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
my $path = "$mars/resource-$res/todo-$host/$key";
my $link;
for (;;) {
$link = get_link($path, $unchecked);
return unless defined($link);
if (defined($inv) && $inv) {
last if $link != $val;
ldie "cannot execute $cmd: switch '$key' must not have value '$val'\n" if !$wait;
lprint "waiting until switch '$key' leaves the value '$val'....\n";
} else {
last if $link == $val;
ldie "cannot execute $cmd: switch '$key' must have value '$val', but actually has value '$link'\n" if !$wait;
lprint "waiting until switch '$key' reaches the value '$val'....\n";
}
sleep_timeout();
}
lprint "OK, '$path' has acceptable value '$link'\n";
}
sub check_status {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
my $path = "$mars/resource-$res/actual-$host/$key";
my $link;
for (;;) {
$link = get_link($path, $unchecked);
return unless defined($link);
if (defined($inv) && $inv) {
last if $link != $val;
ldie "cannot execute $cmd: '$path' must not have value '$val'\n" if !$wait;
lprint "waiting until '$key' leaves the value '$val'...\n";
} else {
last if $link == $val;
ldie "cannot execute $cmd: '$path' must have value '$val'\n" if !$wait;
lprint "waiting until '$key' reaches the value '$val'...\n";
}
sleep_timeout();
}
lprint "OK, '$path' has acceptable value '$link'\n";
}
sub check_mars_device {
my ($cmd, $res, $wait, $inv) = @_;
my $name = get_link("$mars/resource-$res/device-$host", $inv);
my $dev = "/dev/mars/$name";
my $backoff = 1;
my $round = 0;
if ($inv) {
while (-b $dev) {
ldie "cannot execute $cmd: device '$dev' has not yet disappeared\n" if !$wait;
lwarn "device '$dev' has not yet disappeared\n";
sleep_timeout($backoff);
# very slowly increasing backoff
if ($backoff < 10 && $round++ > 5) {
$round = 0;
$backoff++;
}
}
lprint "device '$dev' is no longer present\n" unless -b $dev;
return;
}
# !$inv
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', I should be the designated primary\n" unless $primary eq $host;
while (! -e $dev) {
my $text = get_error_text($cmd, $res);
lprint $text if $text;
ldie "aborting due to errors\n" if $text =~ m/error/mi;
ldie "cannot execute $cmd: device '$dev' not yet present\n" if !$wait;
lprint "device '$dev' not yet present\n";
sleep_timeout($backoff);
# very slowly increasing backoff
if ($backoff < 10 && $round++ > 5) {
$round = 0;
$backoff++;
}
}
lprint "device '$dev' is present\n" if -b $dev;
}
sub check_userspace {
my ($dst) = @_;
if ($dst !~ m:/userspace/:) {
ldie "your path '$dst' must be inside $mars/userspace/ or $mars/resource-*/userspace/\n" unless $force;
lwarn "your path '$dst' is outside $mars/userspace/ or $mars/resource-*/userspace/ and you gave --force, hopefully YOU KNOW WHAT YOU ARE DOING\n";
}
}
##################################################################
# state inspection routines
sub _get_minmax {
my ($res, $glob, $take_symlink) = @_;
my $min = -1;
my $max = -1;
my @paths = glob($glob) or ldie "cannot find '$glob'\n";
foreach my $path (@paths) {
my $nr = $path;
if ($take_symlink) {
$nr = get_link($path);
}
$nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
}
return ($min, $max);
}
sub get_minmax_logfiles {
my ($res) = @_;
return _get_minmax($res, "$mars/resource-$res/log-*", 0);
}
sub get_minmax_versions {
my ($res) = @_;
return _get_minmax($res, "$mars/resource-$res/version-*", 0);
}
sub get_minmax_any {
my ($res) = @_;
return _get_minmax($res, "$mars/resource-$res/{log,version}-*", 0);
}
sub get_minmax_replays {
my ($res) = @_;
return _get_minmax($res, "$mars/resource-$res/replay-*", 1);
}
##################################################################
# versionlink path handling routines
my %visited_pos;
sub _parse_pos {
my ($pos, $do_remember) = @_;
$pos =~ m/((?:log|version)-([0-9]+)-([^,]+)(?:,([0-9]+))?)/ or ldie "cannot parse '$pos'\n";
if ($do_remember) {
my $visit = "$2,$3";
$visited_pos{$visit} = 1;
}
return ($1, int($2), $3, defined($4) ? int($4) : -1);
}
sub _get_prev_pos {
my ($basedir, $nr, $host) = @_;
my $path = sprintf("$basedir/version-%09d-$host", $nr);
my $vers = get_link($path, 1);
$vers =~ s/^.*://;
return $vers;
}
sub _get_common_ancestor {
# TODO: recursive formulation, improve efficiency
my ($basedir, $pos1, $host1, $dep1, $pos2, $host2, $dep2) = @_;
my ($p1, $nr1, $from1, $len1) = _parse_pos($pos1, 0);
my ($p2, $nr2, $from2, $len2) = _parse_pos($pos2, 0);
if ($p1 eq $p2) {
# usually no split brain here (only if both path depths are non-zero)
my $split = ($dep1 && $dep2);
if (!$split) { # additionally check the corresponding version links
my $path1 = sprintf("$basedir/version-%09d-$from1", $nr1);
my $path2 = sprintf("$basedir/version-%09d-$from2", $nr2);
if (my $vers1 = get_link($path1) and my $vers2 = get_link($path2)) {
$split = 1 if $vers1 ne $vers2;
}
}
return ($p1, $split);
} elsif ($nr1 > $nr2) {
# just flip arguments
return _get_common_ancestor($basedir, $pos2, $host2, $dep2, $pos1, $host1, $dep1);
} elsif ($nr1 < $nr2) {
# recursively advance path depth
my $vers2 = _get_prev_pos($basedir, $nr2, $host2);
return ("", -1) if !$vers2;
return _get_common_ancestor($basedir, $pos1, $host1, $dep1, $vers2, $host2, $dep2 + 1);
} elsif ($from1 ne $from2) {
# split brain is sure now, but continue computing the common split point
my $vers1 = _get_prev_pos($basedir, $nr1, $host1);
return ("", 1) if !$vers1;
my $vers2 = _get_prev_pos($basedir, $nr2, $host2);
return ("", 1) if !$vers2;
my ($res, $split) = _get_common_ancestor($basedir, $vers1, $host1, $dep1 + 1, $vers2, $host2, $dep2 + 1);
return ($res, 1);
} elsif ($len1 < $len2) {
# there may be no split brain (just incomplete replay) depending on path depth
return ($p1, $dep1);
} elsif ($len2 < $len1) {
# dto symmetric
return ($p2, $dep2);
}
lwarn "error in algorithm: $p1, $nr1, $from1, $len1 : $p2, $nr2, $from2, $len2\n";
return ("", -1);
}
sub get_common_ancestor {
my ($basedir, $host1, $host2) = @_;
my $repl1 = get_link("$basedir/replay-$host1", 1);
my $repl2 = get_link("$basedir/replay-$host2", 1);
return _get_common_ancestor($basedir, $repl1, $host1, 0, $repl2, $host2, 0);
}
sub detect_splitbrain {
my ($res, $do_report) = @_;
my $basedir = "$mars/resource-$res";
my $ok = 1;
my @list = glob("$mars/resource-$res/replay-*");
my @hosts = map { $_ =~ s:.*/replay-::; $_ } @list;
foreach my $host1 (@hosts) {
foreach my $host2 (@hosts) {
next if $host1 ge $host2;
my ($res, $split) = get_common_ancestor($basedir, $host1, $host2);
if ($split) {
$ok = 0;
if ($do_report) {
lwarn "SPLIT BRAIN at '$res' detected: hostA = '$host1', hostB = '$host2'\n";
} else {
return $ok;
}
}
}
}
return $ok;
}
sub _mark_path_backward {
my ($basedir, $pos) = @_;
for (;;) {
my ($p, $nr, $from, $len) = _parse_pos($pos, 1);
$pos = _get_prev_pos($basedir, $nr, $from);
last if !$pos;
}
}
sub _mark_path_forward {
my ($basedir, $pos) = @_;
my @list = ($pos);
while (@list) {
my %next_list;
foreach $pos (@list) {
my ($p, $nr, $from, $len) = _parse_pos($pos, 1);
my $next = sprintf("$basedir/version-%09d-*", $nr + 1);
my @candidates = glob($next);
foreach my $cand (@candidates) {
my $vers = get_link($cand, 1);
$vers =~ s/^.*://;
my ($cp, $cnr, $cfrom, $clen) = _parse_pos($vers, 0);
if (int($cnr) == int($nr) && $cfrom eq $from && $clen == $len) {
$next_list{$cand} = 1;
}
}
}
@list = keys(%next_list);
}
}
sub _mark_path_transitive {
_mark_path_forward(@_);
_mark_path_backward(@_);
}
sub log_purge_res {
my ($cmd, $res) = @_;
my $basedir = "$mars/resource-$res";
my @files = glob("$basedir/{log,version}-*");
foreach my $replay (glob("$basedir/replay-*")) {
my $target = get_link($replay, 1);
_mark_path_transitive($basedir, $target);
}
foreach my $file (@files) {
$file =~ m:/((log|version)-([0-9]+)-([^,]+)): or ldie "bad path '$file'\n";
my $visit = "$3,$4";
lprint "checking '$1'\n";
next if $visited_pos{$visit};
_create_delete($file);
}
finish_links();
_wait_delete();
}
sub try_to_avoid_splitbrain {
my ($cmd, $res) = @_;
my ($min, $max) = get_minmax_versions($res);
my @host_list = glob("$mars/resource-$res/replay-*");
return if scalar(@host_list) < 2;
my $vers_glob = sprintf("$mars/resource-$res/version-%09d-*", $max);
for (;;) {
my $ok = 1;
my @versions = glob($vers_glob);
my $first = get_link(shift @versions);
while (@versions) {
my $next = get_link(shift @versions);
if ($next ne $first) {
$ok = 0;
}
}
last if $ok;
lprint "trying to avoid split brain: logfile update not yet completed.\n";
sleep_timeout();
}
}
sub get_size {
my $arg = shift || "";
if (!($arg =~ m/^([0-9]+(?:\.[0-9]*)?)([kmgtp]?)$/i)) {
ldie "size argument '$arg' must be a number, optionally followed by suffix k or m or g or t or p\n";
}
my $mod = $2 || "";
$arg = $1;
$_ = $mod;
SWITCH: {
/k/i and $arg *= 1024, last SWITCH;
/m/i and $arg *= 1024 * 1024, last SWITCH;
/g/i and $arg *= 1024 * 1024 * 1024, last SWITCH;
/t/i and $arg *= 1024 * 1024 * 1024 * 1024, last SWITCH;
/p/i and $arg *= 1024 * 1024 * 1024 * 1024 * 1024, last SWITCH;
}
ldie "size '$arg' is not a multiple of 4k\n" if ($arg % 4096) != 0;
return $arg;
}
#
# Get actual primary node from links below actual-*/ subdirs
#
sub _get_actual_primary {
my ($res) = @_;
my @primary_links = glob("$mars/resource-$res/actual-*/is-primary");
my $primary;
foreach my $link (@primary_links) {
if (my $val = get_link($link)) {
$primary = ($link =~ qr%.*actual-([^/]+)/is-primary%)[0];
last;
# Note: if there are more than one 'is-primary' links (an insane state anyway),
# the first 'is-primary' link is selected. Other links are ignored.
}
}
return $primary;
}
sub _get_designated_primary {
my ($res) = @_;
return get_link("$mars/resource-$res/primary");
}
sub get_peers {
my ($res) = @_;
my @list = glob("$mars/resource-$res/data-*");
return map { $_ =~ s:$mars/resource-$res/data-::; $_ } @list;
}
sub __conv_tv {
my ($tv_sec, $tv_nsec) = @_;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(int($tv_sec));
return "$tv_sec.$tv_nsec" unless defined($sec);
return sprintf("%04d-%02d-%02d %02d:%02d:%02d.%s", $year+1900, $mon + 1, $mday, $hour, $min, $sec, $tv_nsec);
}
sub _replace_timestamps {
my ($txt) = @_;
$txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1,$2):ge;
return $txt;
}
sub _get_text {
my ($path, $regex, $do_print) = @_;
open(IN, "<", $path) or return "";
my $text = "";
my $count = 0;
while (my $line = <IN>) {
# use regex e.g. for fetching only errors and warnings
if (!$regex || $line =~ $regex) {
$line = _replace_timestamps($line);
$count++;
if ($do_print) {
print $line;
} else {
$text .= $line;
}
}
}
close(IN);
return $count if $do_print;
return $text;
}
sub get_error_text {
my ($cmd, $res) = @_;
my $text = _get_text("$mars/resource-$res/logstatus-$host.status", "m/^(err|warn)/i", 0);
return $text;
}
##################################################################
# helpers
sub _trigger {
system("(echo 2 > /proc/sys/mars/trigger) >/dev/null 2>&1");
}
sub _switch {
my ($cmd, $res, $path, $on) = @_;
my $src = $on ? "1" : "0";
my $old = get_link($path);
if ($old && $old eq $src) {
lprint "${cmd} on resource $res is already activated\n" if $cmd;
return;
}
set_link($src, $path);
lprint "successfully started ${cmd} on resource $res\n" if $cmd;
}
sub _writable {
my ($path, $on) = @_;
my $oldmode = (lstat $path)[2] & 0700;
my $newmode = $on ? $oldmode | 0200 : $oldmode & ~0200;
lprint "chmod '$path' $oldmode $newmode";
chmod($newmode, $path) == 1 or ldie "cannot chmod '$path'\n";
}
sub _get_ip {
my $ip_path = "$mars/ips/ip-$host";
if (my $from_link = get_link($ip_path, 2)) {
lprint_stderr "Using IP '$from_link' from '$ip_path'\n";
return $from_link;
}
chomp (my @info = `/sbin/ip addr show`);
my $interface = "";
foreach my $line (@info) {
$interface = $1 if $line =~ m#^[0-9]+:\s([a-zA-Z_0-9]+):#;
next if $interface eq "lo";
if ($line =~ m#\sinet\s(\d+\.\d+\.\d+\.\d+)#) {
my $from_if = $1;
lprint_stderr "Using IP '$from_if' from interface '$interface'\n";
return $from_if;
}
}
return undef;
}
sub _fake_versionlink {
my ($basedir, $log_nr, $primary) = @_;
for (my $rounds = 2; $rounds > 0; $rounds--) {
my $new_version = sprintf("$basedir/version-%09d-$host", $log_nr);
my $pri_version = sprintf("$basedir/version-%09d-$primary", $log_nr);
if ($primary eq $host) {
ldie "Cannot fake my own version link '$new_version'\n";
}
my $pri_link = get_link($pri_version, 1);
if (!$pri_link) {
lwarn "cannot read symlink '$pri_version' -- cannot create faked versionlink '$pri_version'\n";
$log_nr++;
next;
}
lprint "creating new version symlink '$new_version' -> '$pri_link'\n";
set_link($pri_link, $new_version);
last;
}
}
sub _set_replaylink {
my ($basedir, $log_nr, $primary, $msg) = @_;
$msg = " -- THIS IS EXTREMELY RISKY -- any inconsistencies are on your own!" unless defined($msg);
ldie "no designated primary defined\n" unless ($primary && $primary ne "(none)");
my $rep_path = "$basedir/replay-$host";
my $rep_val = sprintf("log-%09d-$primary,0,0", $log_nr);
lprint "creating new replaylink '$rep_path' -> '$rep_val'\n";
set_link($rep_val, $rep_path);
if ($log_nr > 1) {
_fake_versionlink($basedir, $log_nr - 1, $primary);
} else {
my $initial;
for (;;) {
$initial = get_link("$basedir/version-000000001-$primary", 1);
last if $initial;
sleep_timeout();
}
set_link($initial, "$basedir/version-000000001-$host");
}
set_link("$log_nr$msg", "$basedir/skip-check-$host");
}
##################################################################
# commands
sub ignore_cmd {
my ($cmd, $res) = @_;
lprint "ignoring command '$cmd' on resource '$res'\n";
}
sub senseless_cmd {
my ($cmd, $res) = @_;
lprint "command '$cmd' makes no sense with MARS (ignoring on resource '$res')\n";
}
sub forbidden_cmd {
my ($cmd, $res) = @_;
ldie "command '$cmd' on resource '$res' cannot be used with MARS (migth affect too many hosts, lead to undesired consequences)\n";
}
sub nyi_cmd {
my ($cmd, $res) = @_;
ldie "command '$cmd' on resource '$res' is not yet implemented\n";
}
sub is_module_loaded {
open TEST, "lsmod | grep mars |";
my $res = <TEST>;
close TEST;
return $res;
}
sub set_link_cmd {
my $cmd = shift;
for (;;) {
my $src = shift || last;
my $dst = shift || ldie "you did not supply a symlink destination for source '$src'\n";
ldie "symlink target '$dst' is not an absolute path\n" unless $dst =~ m:^/:;
check_userspace($dst);
my $dir = `dirname "$dst"` or ldie "path '$dst' has no dirname\n";
chomp $dir;
ldie "directory '$dir' does not exist\n" unless -d $dir;
set_link($src, $dst);
}
}
sub _create_cluster {
my ($cmd) = @_;
system("mkdir $mars") unless -d $mars;
my $old_uuid = get_link("$mars/uuid", 2);
if ($cmd eq "create-cluster") {
ldie "cluster was already created with uuid='$old_uuid'\n" if $old_uuid && !$force;
my $uuid = `echo -n \$(hostname) \$(date)`;
set_link($uuid, "$mars/uuid");
finish_links(); # opportunity for errors => don't continue
} elsif (!$old_uuid && !$force) {
if ($user_version == 0.1) {
my $uuid = `echo -n \$(hostname) \$(date)`;
set_link($uuid, "$mars/uuid");
}
ldie "cluster has no uuid\n" if $user_version > 0.1;
}
system("mkdir $mars/ips") unless -d "$mars/ips";
system("mkdir $mars/userspace") unless -d "$mars/userspace";
system("mkdir $mars/defaults") unless -d "$mars/defaults";
system("mkdir $mars/defaults-$host") unless -d "$mars/defaults-$host";
system("mkdir $mars/todo-global") unless -d "$mars/todo-global";
set_link($ip, "$mars/ips/ip-$host");
set_link("1", "$mars/todo-global/deleted-$host");
}
sub create_cluster {
my ($cmd, $peer) = @_;
ldie "cluster is already created\n" if !$force && -d "$mars/ips";
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
_create_cluster(@_);
}
sub join_cluster {
my ($cmd, $peer) = @_;
if (glob("$mars/resource-*") or glob("$mars/ips/*")) {
ldie "Sorry, some resources already exist!\nThis is dangerous!\nIf you are sure that no resource clash is possible, re-invoke this command with '--force' option\n" unless $force;
}
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
lprint "joining cluster via rsync (peer='$peer')\n";
# check connection
system("ssh $peer uname -a") == 0 or ldie "oops, no connection to $peer ...\n";
mkdir($mars) unless -d $mars;
unless ($dry_run) {
system("rsync --recursive --links --max-size=1 -v $peer:$mars/ $mars/") == 0 or ldie "cannot get remote symlink tree via rsync\n";
}
_create_cluster(@_);
finish_links();
unless ($dry_run) {
system("rsync --recursive --links -v $mars/ips/ $peer:$mars/ips/") == 0 or ldie "oops\n";
}
}
sub leave_cluster {
my ($cmd) = @_;
my $check = "/mars/resource-*/*-$host";
ldie "I am member of some resources\n" if glob($check) && !$force;
_create_delete("$mars/ips/$host");
}
sub create_res {
my ($cmd, $res, $dev, $appear, $size_arg) = @_;
my $create = ($cmd eq "create-resource");
ldie "undefined device or size argument\n" unless $dev;
$appear = $res if !$appear;
check_id($appear) if $appear;
my $resdir = "$mars/resource-$res";
if ($create) {
if (-d $resdir) {
lwarn "resource directory '$res' already exists\n";
my @host_list = glob("$resdir/replay-*");
if (@host_list) {
my $h_list = join(',', map({ $_ =~ s:.*/replay-::;} (@host_list)));
lwarn "DANGER: hosts '$h_list' are already member of resource '$res'.\n";
ldie "REFUSING to trash your resource!\n" unless $force;
}
}
lprint "creating new resource '$res'\n";
} else {
if ( -e "$resdir/data-$host") {
lwarn "resource '$res' has been already joined -- this is dangerous!\n";
ldie "refusing dangerous operation\n" unless $force;
} else {
lprint "joining to existing resource '$res'\n";
}
}
my $size = 0;
if (-b $dev) {
ldie "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//;
use Fcntl 'SEEK_END', 'O_RDONLY', 'O_RDWR', 'O_EXCL';
my $flags = O_RDWR | O_EXCL;
if ($force) {
$flags = O_RDONLY;
}
sysopen(TEST, $dev, $flags) or ldie "cannot open device '$dev' for exclusive rw access\n";
$size = sysseek(TEST, 0, SEEK_END);
close(TEST);
lprint "block device '$dev': determined size = $size bytes\n";
if ($size_arg) {
my $new_size = get_size($size_arg);
ldie "size argument '$size_arg' is smaller than device size '$size'\n" unless $new_size <= $size;
lprint "reducing size from $size to $new_size\n";
$size = $new_size;
}
} else {
$size = get_size($dev);
if ($size > 0) {
$dev = "";
} else {
ldie "bad parameter '$dev': must be either a block device name, or size followed by k or m or g or t\n";
}
}
ldie "implausible size $size" unless $size > 4096 * 16; # smaller floppies should not exist ;)
my $primary;
my $replay_nr = -1;
if ($create) {
mkdir($resdir);
ldie "could not create resource '$res'\n" unless -d $resdir;
set_link($size, "$resdir/size");
} else { # join
ldie "resource '$res' does not exist\n" unless -d $resdir;
my $res_size = get_link("$mars/resource-$res/size");
if ($size < $res_size) {
lwarn "size of new device is only $size, but should be $res_size\n";
ldie "refusing to join due to bad size\n" unless $force;
} elsif ($size > $res_size) {
lprint "Your physical device has size $size, which is larger than the logical resource size $res_size.\n";
lprint "This does no harm, but you are wasting some space.\n";
}
$primary = _get_designated_primary($res);
ldie "Sorry, joining is only possible if a designated primary exists.\n" if $primary eq "(none)";
ldie "implausible state: I ($host) am already designated primary of resource '$res' which I just wanted to join\n" if $primary eq $host;
ldie "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless -l "$mars/ips/ip-$host";
my $replay = get_link("$resdir/replay-$primary");
if ($replay =~ m/^log-([0-9]+)-/) {
$replay_nr = $1;
} else {
ldie "cannot determine current logfile number.\n";
}
}
# check for uniqeness of $appear
if ($appear) {
foreach my $old_dev (glob("$mars/resource-*/device-$host")) {
$old_dev =~ m:/resource-([^/]+)/:;
next unless defined($1);
my $old_res = $1;
next if $old_res eq $res;
my $old_name = get_link($old_dev);
if ($old_name eq $appear) {
if ( -e "$mars/resource-$old_res/data-$host") {
ldie "device '/dev/mars/$old_name' is already present in joined resource '$old_res'\n";
} else {
lwarn "device '/dev/mars/$old_name' is already present in another unjoined resource '$old_res' -- this does no harm, but may be confusing.\n";
}
}
}
# warn if devices are named differently throughout the cluster
foreach my $old_dev (glob("$resdir/device-*")) {
my $old_name = get_link($old_dev);
if ($old_name ne $appear) {
$old_dev =~ m:/device-(.+)$:;
my $old_host = $1;
lwarn "your name '/dev/mars/$appear' differs from '/dev/mars/$old_name' on host '$old_host'.";
lwarn "this does no harm, but may be confusing.";
}
}
}
my $file = "$resdir/data-$host";
if (!$dev) {
lwarn "file '$file' already exists - reusing\n" if -e $file;
lprint "setup sparse file '$file' with size $size\n";
open(OUT, ">>", $file) or ldie "could not open '$file'\n";
truncate(OUT, $size) or ldie "truncate to size $size failed\n";
close OUT;
} else {
lprint "using existing device '$dev'\n";
set_link($dev, $file);
}
if ($appear) {
lprint "resource '$res' will appear as local device '/dev/mars/$appear'\n";
set_link($appear, "$resdir/device-$host");
}
mkdir("$resdir/userspace") unless -d "$resdir/userspace";
mkdir("$resdir/defaults") unless -d "$resdir/defaults";
mkdir("$resdir/defaults-$host");
mkdir("$resdir/local-$host");
mkdir("$resdir/actual-$host");
my $todo = "$resdir/todo-$host";
mkdir($todo);
set_link("1", "$todo/attach");
set_link("1", "$todo/connect");
set_link("1", "$todo/sync");
set_link("1", "$todo/allow-replay");
unlink("$resdir/syncstatus-$host");
if ($create) {
set_link($host, "$resdir/primary");
set_link($size, "$resdir/syncstatus-$host");
my $startnr = get_link("$resdir/maxnr", 2);
if (defined($startnr) && $startnr > 0) {
$startnr += 1000;
} else {
$startnr = 1;
}
my $fmt = sprintf("%09d", $startnr);
set_link("log-$fmt-$host,0,0", "$resdir/replay-$host");
system("touch $resdir/log-$fmt-$host") unless $dry_run;
set_link("00000000000000000000000000000000,log-$fmt-$host,0:", "$resdir/version-$fmt-$host");
set_link("$startnr", "$resdir/maxnr");
finish_links();
lprint "successfully created resource '$res'\n";
} else {
_set_replaylink($resdir, $replay_nr, $primary, "");
set_link("0", "$resdir/syncstatus-$host");
finish_links();
lprint "successfully joined resource '$res'\n";
}
}
sub leave_res_phase0 {
my ($cmd, $res) = @_;
check_not_primary(@_);
my $errors = 0;
foreach my $tmp (glob("$mars/resource-$res/todo-$host/*")) {
my $status = get_link($tmp, 2);
if ($status) {
lwarn "switch '$tmp' is not off\n";
$errors++;
}
}
if (!$force) {
foreach my $tmp (glob("$mars/resource-$res/actual-$host/{is-,logfile-}*")) {
my $status = get_link($tmp);
if ($status) {
lwarn "running status '$tmp' is not off\n";
$errors++;
}
}
check_status($cmd, $res, "is-attached", 0, 0, 1);
ldie "there were $errors errors.\n" if $errors;
}
}
sub leave_res_phase1 {
my ($cmd, $res) = @_;
_create_delete("$mars/resource-$res/replay-$host");
_create_delete("$mars/resource-$res/data-$host");
_create_delete("$mars/resource-$res/syncstatus-$host");
_create_delete("$mars/resource-$res/device-$host");
_create_delete("$mars/resource-$res/actsize-$host");
foreach my $dir (glob("$mars/resource-$res/*-$host/")) {
foreach my $tmp (glob("${dir}*")) {
_create_delete($tmp);
}
_create_delete($dir);
}
finish_links();
}
# wait for deletions (avoid races with following commands)
sub leave_res_phase2 {
my ($cmd, $res) = @_;
_wait_delete();
log_purge_res(@_);
}
sub delete_res {
my ($cmd, $res) = @_;
my $basedir = "$mars/resource-$res";
# preconditions
if (! -d $basedir) {
lprint "resource directory '$basedir' does no longer exist.\n";
return;
}
my @host_list = glob("$basedir/replay-*");
my $cnt = scalar(@host_list);
if ($cnt > 0) {
my $h_list = join(',', map({ $_ =~ s:.*/replay-::;} (@host_list)));
ldie "resource '$res' is not empty: first remove the hosts '$h_list' via leave-resource\n" unless $force;
lwarn "BRUTE FORCE resource destruction: '$res' has $cnt members ($h_list) THESE ARE FINALLY TRASHED right now -- you are RESPONSIBLE for any subsequent problems.\n";
}
for my $path (`find $basedir/* | sort -r`) {
chomp $path;
next if $path =~ m:/maxnr$:;
_create_delete($path);
}
finish_links();
_wait_delete();
}
sub logrotate_res {
my ($cmd, $res) = @_;
check_primary(@_);
my @paths = glob("$mars/resource-$res/log-*-$host") or ldie "cannot find any logfiles\n";
@paths = sort(@paths);
my $last = pop(@paths);
if (-z $last) {
lprint "an empty logfile '$last' already exists, nothing to do.\n";
return;
}
my $nr = $last;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1);
ldie "logfile '$next' already exists\n" if -e $next;
system("touch $next") unless $dry_run;
my $startnr = get_link("$mars/resource-$res/maxnr", 1);
$startnr = $nr + 1 if ($nr >= $startnr);
set_link("$startnr", "$mars/resource-$res/maxnr");
}
sub _get_deletable_logfiles {
my ($cmd, $res) = @_;
my $min = -1;
my $max = -1;
my @log_paths = glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
foreach my $path (@log_paths) {
$path =~ m/\/log-([0-9]+)-/;
my $nr = $1;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
}
my @paths = glob("$mars/resource-$res/replay-*") or ldie "cannot find any replay symlinks\n";
foreach my $path (@paths) {
my $target = get_link($path);
$target =~ m/^log-([0-9]+)/;
my $nr = $1;
$max = $nr if ($nr < $max || $max < 0);
}
lprint "min deletable logfile number: $min\n";
lprint "min non-deletable logfile number: $max\n";
return ($min, $max);
}
my $delete_nr = -1;
sub _create_delete {
my ($target) = @_;
ldie "cannot delete: '$target' is no absolute path\n" unless $target =~ m:^/:;
if ($delete_nr < 0) { # compute only upon first call
my @paths = glob("$mars/todo-global/delete-*");
foreach my $path (@paths) {
$path =~ m/-([0-9]+)/;
if (defined($1) && $1 > $delete_nr) {
$delete_nr = $1;
}
}
my @paths2 = glob("$mars/todo-global/deleted-*");
foreach my $path (@paths2) {
my $link = get_link($path, 1);
$link =~ m/([0-9]+)/;
if (defined($1) && $1 > $delete_nr) {
$delete_nr = $1;
}
}
}
my $new = sprintf("$mars/todo-global/delete-%09d-$host", ++$delete_nr);
lprint "create symlink $new -> $target\n";
set_link($target, $new);
}
sub _wait_delete {
return if $dry_run;
for (;;) {
my $deleted = get_link("$mars/todo-global/deleted-$real_host");
last if $deleted >= $delete_nr;
lprint "waiting for deletions to apply locally....\n";
sleep_timeout();
}
}
sub delete_file_cmd {
my $cmd = shift;
my $res = shift; # ignore this
foreach my $path (@_) {
check_userspace($path);
_create_delete($path);
}
}
sub logdelete_res {
my ($cmd, $res) = @_;
my @paths = glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
@paths = sort(@paths);
my ($min_deletable, $max_deletable) = _get_deletable_logfiles(@_);
if ($min_deletable >= $max_deletable) {
lprint "no logfiles are deletable.\n";
return;
}
if ($cmd ne "log-delete-all") {
$max_deletable = $min_deletable + 1; # delete only the first one
}
my $nr = 0;
for (;;) {
my $first = shift(@paths);
last unless $first;
$nr = $first;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
next unless $nr < $max_deletable;
lprint "chosen '$first' for deletion\n";
_create_delete($first);
}
lprint "removing left-over version symlinks...\n";
foreach my $versionlink (glob("$mars/resource-$res/version-*")) {
my $nrv = $versionlink;
$nrv =~ s/^.*\/version-([0-9]+)-.+$/$1/;
# we need at least one more version link than logfiles for consistency checks
next unless $nrv < $max_deletable - 1;
_create_delete($versionlink);
}
}
sub attach_res_phase0 {
my ($cmd, $res) = @_;
return if $force;
my $detach = ($cmd eq "detach");
if ($detach) {
my $device_in_use = get_link("$mars/resource-$res/actual-$host/open-count", 1);
if ($device_in_use) {
my $name = get_link("$mars/resource-$res/device-$host");
my $dev = "/dev/mars/$name";
ldie "device '$dev' is in use\n";
}
}
}
sub attach_res_phase1 {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
my $path = "$mars/resource-$res/todo-$host/attach";
_switch($cmd, $res, $path, !$detach);
}
sub attach_res_phase2 {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
check_status($cmd, $res, "is-attached", $detach ? 0 : 1, 1);
if ($detach) {
check_mars_device($cmd, $res, 1, 1);
check_status($cmd, $res, "is-replaying", 0, 1);
check_status($cmd, $res, "is-syncing", 0, 1);
}
}
sub connect_global_res {
my ($cmd, $res) = @_;
my $disconnect = ($cmd =~ m/disconnect/);
my @paths = glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/connect", !$disconnect);
}
}
sub connect_local_res {
my ($cmd, $res) = @_;
my $disconnect = ($cmd =~ m/disconnect/);
my $path = "$mars/resource-$res/todo-$host/connect";
_switch($cmd, $res, $path, !$disconnect);
}
sub pause_sync_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my @paths = glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/sync", !$pause);
}
}
sub pause_sync_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, !$pause);
}
sub pause_replay_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my @paths = glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/allow-replay", !$pause);
}
}
sub pause_replay_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my $path = "$mars/resource-$res/todo-$host/allow-replay";
_switch($cmd, $res, $path, !$pause);
}
sub up_res_phase0 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
attach_res_phase0("detach", $res);
} else {
attach_res_phase0("attach", $res);
}
}
sub up_res_phase1 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
pause_replay_local_res("pause-replay-local", $res);
pause_sync_local_res("pause-sync-local", $res);
connect_local_res("disconnect", $res);
attach_res_phase1("detach", $res);
} else {
attach_res_phase1("attach", $res);
connect_local_res("connect", $res);
pause_sync_local_res("resume-sync-local", $res);
pause_replay_local_res("resume-replay-local", $res);
}
}
sub up_res_phase2 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
attach_res_phase2("detach", $res);
} else {
attach_res_phase2("attach", $res);
}
}
sub set_replay_res {
my ($cmd, $res, $new_nr) = @_;
if (!$new_nr || $new_nr <= 0) {
ldie "you must supply a numeric logfile number as third argument.\n";
}
check_not_primary(@_);
check_todo($cmd, $res, "allow-replay", 0, 0);
my $replaylink = "$mars/resource-$res/replay-$host";
my $old_val = get_link($replaylink);
my $old_nr = $old_val;
$old_nr =~ s/log-([0-9]+)-.*/$1/;
ldie "old log number '$old_nr' is wrong\n" unless $old_nr > 0;
if ($new_nr > $old_nr) {
lwarn "you try to skip logfile numbers from $old_nr to $new_nr, are you sure?\n";
ldie "you would need --force if you really know what you are doing.\n" unless $force;
}
my $primary = _get_designated_primary($res);
_set_replaylink("$mars/resource-$res", $new_nr, $primary);
}
sub fake_local_res {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, 0);
#check_status($res, "copy-syncstatus-$host", 0);
my $size = get_link("$mars/resource-$res/size");
my $target = "$mars/resource-$res/syncstatus-$host";
set_link($size, $target);
}
sub _primary_res {
my ($res, $new, $old) = @_;
my $pri = "$mars/resource-$res/primary";
set_link($new, $pri);
lprint "designated primary changed from '$old' to '$new'\n";
}
# check whether primary/secondary switching is possible at all
sub primary_phase0 {
my ($cmd, $res) = @_;
ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars");
if ($force) {
lwarn "You can do a '$cmd --force' only in DISCONNECTED state.\n";
check_todo($cmd, $res, "connect", 0, 0);
}
if ($cmd eq "primary") {
check_sync_finished($res, $host);
check_todo($cmd, $res, "attach", 1, 0);
check_todo($cmd, $res, "connect", 1, 0) if !$force;
check_todo($cmd, $res, "allow-replay", 1, 0);
}
my $old = _get_designated_primary($res);
return if ($old eq $host and $cmd eq "primary");
return if $old eq "(none)";
my $open_count_path = "$mars/resource-$res/actual-$old/open-count";
my $device_in_use = get_link($open_count_path, 1);
if ($device_in_use) {
my $name = get_link("$mars/resource-$res/device-$old", 1) || "unknown";
lwarn "device '/dev/mars/$name' for resource '$res' is $device_in_use times in use on primary host '$old'\n";
ldie "first you must umount/close the device (on host '$old')\n" unless $force;
lwarn "First you SHOULD umount/close the device (on host '$old'), but you ignore this recommendation by giving the --force option.\n";
if (is_link_recent($open_count_path)) {
lwarn "You are forcing a SPLIT BRAIN via --force right now. Do you know that this is an ERRONEOUS state? Do you really know what you are doing?\n";
} else {
lwarn "You may produce a SPLIT BRAIN via --force because the peer host '$old' is currently not reachable.\n";
}
}
lprint "all preconditions OK for resource '$res'\n";
}
# when necessary, switch to secondary (intermediately)
sub primary_phase1 {
my ($cmd, $res) = @_;
return if ($force and $cmd eq "primary");
my $old = _get_designated_primary($res);
return if ($old eq $host and $cmd eq "primary");
my $new = "(none)";
return if $old eq $new;
_primary_res($res, $new, $old);
}
# when necessary, wait
sub primary_phase2 {
my ($cmd, $res) = @_;
return if $force;
return unless $cmd eq "primary";
check_primary_gone($res);
my $ok = detect_splitbrain($res);
try_to_avoid_splitbrain(@_) if $ok;
}
# when necessary, switch to primary
sub primary_phase3 {
my ($cmd, $res) = @_;
return unless $cmd eq "primary";
my $old = _get_designated_primary($res);
my $new = $host;
_primary_res($res, $new, $old);
}
# wait for device to appear / disappear
sub primary_phase4 {
my ($cmd, $res) = @_;
if($cmd eq "secondary") {
check_mars_device($cmd, $res, 1, 1);
return;
}
my $ok = detect_splitbrain($res, 1);
if (!$ok) {
lwarn "Sorry, in split brain situations I can only set the _designated_\n";
lwarn "primary, but I cannot _guarantee_ that becoming the\n";
lwarn "the actual primary is possible.\n";
lwarn "You SHOULD resolve the split brain ASAP (e.g. by leave-resource\n";
lwarn "or invalidate etc).\n";
return;
}
check_mars_device($cmd, $res, 1, 0);
}
sub wait_umount_res {
my ($cmd, $res) = @_;
while (1) {
my $sum = 0;
foreach my $path (glob("$mars/resource-$res/actual-*/open-count")) {
$sum += get_link($path);
}
last if !$sum;
lprint "device for resource '$res' is $sum times in use somewhere\n";
sleep_timeout(3);
}
lprint "OK, device for resource '$res' is not in use.\n";
}
sub invalidate_res_phase0 {
my ($cmd, $res) = @_;
check_not_primary(@_);
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', some other designated primary must exist (currently there is none)\n" if $primary eq "(none)";
}
sub invalidate_res_phase1 {
my ($cmd, $res) = @_;
_switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/allow-replay", 0);
}
sub invalidate_res_phase2 {
my ($cmd, $res) = @_;
if (!$force) {
check_status($cmd, $res, "is-syncing", 0, 1);
check_status($cmd, $res, "is-replaying", 0, 1);
}
}
sub invalidate_res_phase3 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
my $replay = get_link("$mars/resource-$res/replay-$primary");
$replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n";
my $replay_nr = $1;
set_link("0", $dst);
finish_links(); # opportunity for errors => don't continue
_set_replaylink("$mars/resource-$res", $replay_nr, $primary, "");
_switch($cmd, $res, "$mars/resource-$res/todo-$host/allow-replay", 1);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 1);
}
sub resize_res {
my ($cmd, $res, $size_arg) = @_;
my $new_size = 0;
if ($size_arg) {
$new_size = get_size($size_arg);
ldie "optional size argument '$new_size' must be numeric and positive\n" unless scalar($new_size) > 0;
lprint "new size: $new_size bytes\n";
}
check_primary(@_);
my @actsizes = glob("$mars/resource-$res/actsize-*");
ldie "resource $res has no actsize-* symlinks\n" unless @actsizes;
my $lnk = "$mars/resource-$res/size";
my $old_size = get_link($lnk);
my $min_size = 0;
foreach my $actsize (@actsizes) {
my $this_size = get_link($actsize);
if (!$min_size || $this_size < $min_size) {
$min_size = $this_size;
}
}
lprint "old_size=$old_size\n";
lprint "min_size=$min_size\n";
$new_size = $min_size if !$new_size;
lprint "new_size=$new_size\n";
ldie "new size $new_size is higher than the minimum size of all volumes $min_size" if $new_size > $min_size; # no override with --force possible
# for now, disallow decreasing until some bugs are fixed
ldie "only increases of the size are possible!\n" if $new_size < $old_size;
ldie "only increases of the size are possible without --force\n" if $new_size <= $old_size && !$force;
foreach my $switch (glob("$mars/resource-$res/todo-*/sync")) {
my $this_switch = get_link($switch);
ldie "sync on '$switch' is switched on -- use marsadm pause-sync to stop\n" unless !$this_switch;
}
my @syncsizes = glob("$mars/resource-$res/syncstatus-$host");
foreach my $syncsize (@syncsizes) {
my $this_size = get_link($syncsize);
ldie "sync on $syncsize has not yet finished: $this_size != $old_size (DANGEROUS FIX: if you know what you are doing, marsadm fake-sync can 'fix' it -- but this may need a full-sync afterwards)\n" unless $this_size == $old_size;
}
foreach my $syncsize (@syncsizes) {
my $this_size = get_link($syncsize);
set_link($new_size, $syncsize);
}
set_link($new_size, $lnk);
}
sub role_cmd {
my ($cmd, $res) = @_;
my $primary = _get_actual_primary($res) || '(none)';
my $todo_primary = _get_designated_primary($res);
my $msg = "I am actually ";
$msg .= ($primary eq $host) ? "primary" : "secondary";
if ($primary eq $todo_primary) {
$msg .= " and $primary is primary" if ($primary ne $host);
}
elsif ($primary ne $todo_primary) {
$todo_primary = "I" if ($todo_primary eq $host);
$msg .= " and $todo_primary should be primary";
}
lprint $msg . "\n";
}
sub mars_state_cmd {
my ($cmd, $res) = @_;
my $primary = _get_actual_primary($res) || '(none)';
my $todo_primary = _get_designated_primary($res);
if ($primary eq $host) {
lprint "is_primary\n";
return;
}
elsif ($todo_primary eq $host) {
lprint "becoming_primary\n";
return;
}
# secondary without ambitions to become primary
my $size = get_link("$mars/resource-$res/size");
my $syncstatus = get_link("$mars/resource-$res/syncstatus-$host");
if ($syncstatus != $size) {
lprint "secondary inconsistent ($syncstatus bytes of $size)\n";
return;
}
if ($primary eq "(none)") {
my $min = 0;
foreach my $path (glob("$mars/resource-$res/log-*")) {
my $nr = $path;
$nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:;
if ($nr > $min) {
$primary = $path;
$primary =~ s:^.*/[a-z]+-[0-9]+-([^/]*)$:$1:;
$min = $nr;
}
}
}
my $primary_replay = get_link("$mars/resource-$res/replay-$primary");
my $host_replay = get_link("$mars/resource-$res/replay-$host");
if ($primary_replay eq $host_replay) {
lprint "secondary uptodate\n";
return;
}
lprint "secondary outdated ($host_replay instead of $primary_replay)\n";
}
sub cat_cmd {
my $cmd = shift;
foreach my $path (@_) {
_get_text($path, undef, 1);
}
}
sub mars_info_cmd {
my ($cmd, $res) = @_;
my $info = "$mars/resource-$res/logstatus-$host.status";
cat_cmd($cmd, $info);
}
sub show_cmd {
my ($cmd, $res) = @_;
$res = "*" if !$res || $res eq "all";
my $glob = "$mars/{ips/ip-$host,alive-$host,emergency-$host,rest-space-$host,resource-$res/{device,primary,size,actsize-$host,syncstatus-$host,replay-$host,actual-$host/*,todo-$host/*}}";
foreach my $link (glob($glob)) {
next unless -l $link;
my $res = get_link($link);
my $short = $link;
$short =~ s:^$mars/::;
lprint "$short=$res\n";
}
}
sub show_errors_cmd {
my ($cmd, $res) = @_;
my $text = get_error_text(@_);
if ($text) {
lprint $text;
ldie "resource $res has some (old) problems.\n";
} else {
lprint "no errors/warnings are reported.\n";
}
}
sub helplist {
my $temp;
$temp = shift;
lprint "ERROR: $temp" if ($temp);
lprint "
marsadm [<global_option>] <command> [<resource>] [<option>]
<global_option> =
--force : skip safety checks
use only when you know what you are doing!
--timeout=<seconds> : leave safety checks after timeout with an error
instead of waiting forever
<command> =
cluster : create-cluster join-cluster leave-cluster
resource : create-resource join-resource leave-resource
primary secondary invalidate
up down
replication: connect disconnect connect-local disconnect-local
replay : pause-replay resume-replay pause-replay-local resume-replay-local
sync : pause-sync resume-sync pause-sync-local resume-sync-local fake-sync
logfile : log-rotate log-delete log-delete-all
device : attach detach
internal : help version
<resource> = name of resource or \"all\" for all resources
<option> = special by command
\n";
exit 0;
}
sub version {
lprint "$0 $Id\n";
#lprint "my IP is $ip\n";
exit 0;
}
##################################################################
my %cmd_table =
(
# new keywords
"create-cluster" => \&create_cluster,
"join-cluster" => \&join_cluster,
"leave-cluster" => \&leave_cluster,
"create-resource" => \&create_res,
"join-resource" => \&create_res,
"leave-resource" => [
"check preconditions", \&leave_res_phase0,
"switch state", \&leave_res_phase1,
"wait for deletions", \&leave_res_phase2,
],
"delete-resource" => \&delete_res,
"log-rotate" => \&logrotate_res,
"log-delete" => \&logdelete_res,
"log-delete-all" => \&logdelete_res,
"log-purge-all" => \&log_purge_res,
"fake-sync" => \&fake_local_res,
"set-link" => \&set_link_cmd,
"delete-file" => \&delete_file_cmd,
"cat" => \&cat_cmd,
"show" => \&show_cmd,
"show-errors" => \&show_errors_cmd,
"show-state" => \&mars_state_cmd,
"mars-state" => \&mars_state_cmd, # deprecated
"show-info" => \&mars_info_cmd,
"mars-info" => \&mars_info_cmd, # deprecated
"pause-replay-local" => \&pause_replay_local_res,
"pause-replay-global" => \&pause_replay_global_res,
"pause-replay" => \&pause_replay_local_res,
"resume-replay-local" => \&pause_replay_local_res,
"resume-replay-global"=> \&pause_replay_global_res,
"resume-replay" => \&pause_replay_local_res,
"set-replay" => \&set_replay_res,
"wait-umount" => \&wait_umount_res,
"wait-cluster" => \&wait_cluster,
"wait-resource" => \&wait_cluster,
# compatible keywords (or their derivatives)
"attach" => [
"check preconditions", \&attach_res_phase0,
"switch state", \&attach_res_phase1,
"wait for effect", \&attach_res_phase2,
],
"detach" => [
"check preconditions", \&attach_res_phase0,
"switch state", \&attach_res_phase1,
"wait for effect", \&attach_res_phase2,
],
"connect-local" => \&connect_local_res,
"connect-global" => \&connect_global_res,
"connect" => \&connect_local_res,
"disconnect-local" => \&connect_local_res,
"disconnect-global" => \&connect_global_res,
"disconnect" => \&connect_local_res,
"syncer" => \&ignore_cmd,
"up" => [
"check preconditions", \&up_res_phase0,
"switch state", \&up_res_phase1,
"wait for effect", \&up_res_phase2,
],
"down" => [
"check preconditions", \&up_res_phase0,
"switch state", \&up_res_phase1,
"wait for effect", \&up_res_phase2,
],
"primary" => [
"check preconditions", \&primary_phase0,
"leave primary state", \&primary_phase1,
"wait when necessary", \&primary_phase2,
"switch to primary", \&primary_phase3,
"wait for device", \&primary_phase4,
],
"secondary" => [
"check preconditions", \&primary_phase0,
"leave primary state", \&primary_phase1,
"wait for effect", \&primary_phase4,
],
"invalidate" => [
"check preconditions", \&invalidate_res_phase0,
"stop old replay", \&invalidate_res_phase1,
"wait for replay off", \&invalidate_res_phase2,
"force symlinks", \&invalidate_res_phase3,
],
"invalidate-remote" => \&forbidden_cmd,
"resize" => \&resize_res,
"create-md" => \&senseless_cmd,
"get-gi" => \&ignore_cmd,
"show-gi" => \&ignore_cmd,
"dump-md" => \&senseless_cmd,
"outdate" => \&ignore_cmd,
"adjust" => \&ignore_cmd,
"wait-connect" => \&wait_cluster,
"role" => \&role_cmd,
"state" => \&role_cmd,
"cstate" => \&nyi_cmd,
"dstate" => \&nyi_cmd,
"status" => \&nyi_cmd,
"dump" => \&senseless_cmd,
"verify" => \&forbidden_cmd,
"pause-sync-local" => \&pause_sync_local_res,
"pause-sync-global" => \&pause_sync_global_res,
"pause-sync" => \&pause_sync_local_res,
"resume-sync-local" => \&pause_sync_local_res,
"resume-sync-global"=> \&pause_sync_global_res,
"resume-sync" => \&pause_sync_local_res,
"new-current-uuid" => \&senseless_cmd,
"hidden-commands" => \&ignore_cmd,
);
my @args;
foreach my $arg (@ARGV) {
if ($arg eq "--force" || $arg eq "-f") {
$force++;
next;
} elsif ($arg eq "--dry-run" || $arg eq "-d") {
$dry_run++;
next;
} elsif ($arg eq "--verbose" || $arg eq "-v") {
$verbose++;
next;
} elsif ($arg =~ s/--timeout\s*=\s*([0-9]+)/$1/) {
$timeout = $arg;
next;
} elsif ($arg =~ s/--host\s*=\s*([-_A-Za-z0-9]+)/$1/) {
ldie "host '$arg' does not exist in /mars/ips/ip-*\n" unless -l "/mars/ips/ip-$arg";
if ($arg ne $host) {
lprint "ATTENTION: acting as if I were host '$arg'\n";
lwarn "some commands require local knowledge not available here.\n";
lwarn "thus something may fail or go wrong - use this at your risk!\n";
$host = $arg;
}
next;
} elsif ($arg =~ s/--ip\s*=\s*([0-9.:\[\]]+)/$1/) {
$ip = $arg;
lprint_stderr "Using IP '$ip' from command line.\n";
next;
}
if ($arg =~ s/^force-//) {
$force++;
}
push @args, $arg;
}
my $cmd = shift @args || helplist "command argument is missing\n";
if (!$ip) {
$ip = _get_ip() or ldie "cannot determine my IP address\n";
}
$notify = "(cmd: $cmd)" unless $cmd eq "version";
if ($cmd =~ m/^help$/ || $cmd =~ m/^h$/) {
helplist;
}
if ($cmd =~ m/^version$/ || $cmd =~ m/^v$/) {
version;
}
ldie "only root may use this tool\n" if $< != 0 && $cmd !~ m/^cat$/; # getpid() seems to be missing in perlfunc
helplist "unknown command $cmd\n" if !exists $cmd_table{$cmd};
if (!(-d $mars) && $cmd !~ m/(create|join)-cluster|cat/) {
ldie "The $mars directory does not exist.\n";
}
my $res = "";
if ($cmd eq "show") {
$res = shift @args;
} elsif (!($cmd =~ m/^(create|leave|wait)-cluster|cat|[a-z]+-file|set-[a-z]+$/)) {
$res = shift @args || helplist "resource argument is missing\n";
check_id($res);
}
lwarn "Using FORCE option -- hopefully you know what you are doing!\n" if $force;
my %checked_res;
sub do_one_res {
my $func = shift;
my ($cmd, $res) = @_;
if (!$checked_res{"$cmd$res"}) {
$res = check_res($res) unless $cmd =~ m/^(join|create|leave|wait)-cluster|create-resource|show|cat|[a-z]+-file|set-link$/;
check_res_member($res) unless $cmd =~ m/^(join|create|delete)-(cluster|resource)|(leave|wait)-cluster|show|cat|[a-z]+-file|set-link$/;
detect_splitbrain($res, 1);
$checked_res{"$cmd$res"} = 1;
}
&{$func}(@_);
}
my %skip_res;
sub do_all_res {
my $func = shift;
my $cmd = shift;
my $res = shift || "all";
if ($res eq "all" && $cmd !~ m/show|cat|cluster|set-link|delete-file/) {
ldie "For safety reasons, --force is only allowed on explicitly named resources. Combination of 'all' with --force is disallowed!\n" if $force;
ldie "Cannot combine command '$cmd' with 'all' existing resources - you must explicitly name a single new resource\n" if $cmd =~ m/create|join/;
my $any_success = 0;
my $any_member = 0;
foreach $res (glob("$mars/resource-*")) {
next unless -e "$res/data-$host";
$any_member++;
$res =~ s/^.*\/resource-(.*)$/$1/;
next if defined($skip_res{$res});
lprint "--------- resource $res\n";
eval {
do_one_res($func, $cmd, $res, @_);
1;
} and $any_success = 1 or $skip_res{$res} = 1;
}
if (!$any_success) {
if (!$any_member) {
lprint "I am not member of any resource\n";
return 1;
}
ldie "all resources have errors\n";
}
return !$any_success;
} else {
return do_one_res($func, $cmd, $res, @_);
}
}
my $func = $cmd_table{$cmd};
ldie "unknown command '$cmd'\n" unless $func;
if (ref($func) eq "ARRAY") {
my @list = @$func;
while (@list) {
my $headline = shift @list;
my $memb_func = shift @list;
lprint "---------------------------- $headline:\n";
my $status = do_all_res($memb_func, $cmd, $res, @args);
last if (defined($status) && $status);
finish_links();
}
} elsif (ref($func) eq "CODE") {
do_all_res($func, $cmd, $res, @args);
} else {
ldie "internal error: command table is wrong for '$cmd'";
}
finish_links();
exit($error_count);