mirror of
synced 2025-02-20 22:36:57 +00:00
1379 lines
43 KiB
Executable File
1379 lines
43 KiB
Executable File
#!/usr/bin/perl -w
# (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
use strict;
use English;
use warnings;
umask 0077;
# messaging
# Prefix that lprint puts in front of every message; ldie and lwarn set it.
my $warn = "";
# When non-empty, lprint also forwards each message to /usr/bin/logger.
my $notify = "";
# Print a message to stdout, prefixed with the current $warn tag.
# When $notify is set, the message is additionally handed to
# /usr/bin/logger under the tag "marsadm".
#   $text - message text; callers supply their own trailing newline
sub lprint {
  my ($text) = @_;
  print "$warn$text";
  if ($notify) {
    # List form: the message travels as one argument and never passes
    # through a shell, so quotes, backticks or '$' inside $text cannot be
    # interpreted as shell syntax.
    system("/usr/bin/logger", "-t", "marsadm", "$warn$notify $text");
  }
}
# Report a fatal error and terminate the program.
# Switches the message prefix to "DYING: ", prints the text via lprint
# and then exits with status -1.
sub ldie {
  my $message = shift;
  $warn = "DYING: ";
  lprint($message);
  exit(-1);
}
# Print a warning: the text is emitted via lprint with the prefix
# "WARNING: "; the previous prefix is restored afterwards.
sub lwarn {
  my $message = shift;
  my $saved_prefix = $warn;
  $warn = "WARNING: ";
  lprint($message);
  $warn = $saved_prefix;
}
# low-level infrastructure
# Read the target of the symlink $path.
#   $unchecked - false/omitted: an unreadable link is fatal (ldie)
#                1:             an unreadable link only produces a warning
#                other true:    an unreadable link is silently tolerated
# Returns the link target, or undef when the link could not be read
# (only reachable when $unchecked is true).
sub get_link {
  my ($path, $unchecked) = @_;
  my $target = readlink($path);
  return $target if defined($target);
  ldie("cannot read symlink '$path'\n") unless $unchecked;
  lwarn("cannot read symlink '$path'\n") if $unchecked == 1;
  return $target;
}
# global variables and checks
# Keyword placeholder; printed by the version command.
my $Id = '$Id$ ';
# Version of this userspace tool; compared below with the tree-$host link.
my $user_version = 0.1;
# Root directory under which all symlinks and files of this tool live.
my $mars = "/mars";
# Network node name of this machine, as reported by uname -n.
my $host = `uname -n` or ldie "cannot determine my network node name\n";
chomp $host;
# Safety checks guarded by "unless $force" / "!$force" are skipped when set.
my $force = 0;
# Remaining waiting budget; a negative value is treated specially by
# sleep_timeout.
my $timeout = -1;
my $ip = _get_ip() or ldie "cannot determine my IP address\n";
if (! -d $mars) {
ldie "The $mars directory does not exist.\n";
# NOTE(review): no regex match that would set $1 is visible before this
# point, and "! $1 =~ /cluster/" parses as "(!$1) =~ /cluster/".
# Presumably the intent is to skip the version check for the *-cluster
# commands -- confirm against the full file.
if (! $1 =~ /cluster/) {
# The tree-$host link is read with $unchecked=1, so a missing link only warns.
my $kernel_version = get_link("$mars/tree-$host", 1);
if ($kernel_version && $user_version < $kernel_version) {
ldie "Sorry, your MARS kernel module uses version $kernel_version, but my $0 userspace version is only $user_version. That cannot work. Please upgrade your userspace scripts!\n";
# timeout handling
# Return the lamport clock time as found in the "lamport_now=" line of
# /proc/sys/mars/lamport_clock.
# Fallback when the file cannot be opened or contains no such line (or the
# value is false): system time() with nine '0' characters appended.
sub mars_time {
  my $lamport_time;
  # Only read when the open succeeded; reading from an unopened handle
  # would merely emit warnings under -w.
  if (open(my $lamport_clock, "<", "/proc/sys/mars/lamport_clock")) {
    while (my $line = <$lamport_clock>) {
      $lamport_time = $1 if $line =~ /^lamport_now=(.*)/;
    }
    close($lamport_clock);
  }
  return $lamport_time || time() . '0' x 9;
}
# Account for one waiting step against the global $timeout budget.
#   optional argument: step length in seconds (defaults to 5)
# NOTE(review): neither a sleep() call nor the closing braces are visible
# in this excerpt, so the block looks truncated -- confirm against the
# full file before relying on the control flow shown here.
sub sleep_timeout {
my $sleeptime = shift || 5;
if ($timeout < 0) {
# Fatal once the remaining budget is exactly 0.
ldie "Timeout reached. You may retry with --timeout=-1 to ensure waiting until progress is possible.\n" if !$timeout;
# Consume at most $sleeptime, and never more than what is left.
my $rest = $timeout;
$rest = $sleeptime if $rest > $sleeptime;
$timeout -= $rest;
# syntactic checks
# Validate an identifier syntactically.
# Accepted: a letter or underscore followed by letters, digits, '-' or '_',
# at most 16 characters in total. Anything else is fatal (ldie).
sub check_id {
  my $str = shift;
  # \A ... \z instead of ^ ... $: '$' would also accept an identifier that
  # ends in a newline.
  ldie("identifier '$str' has disallowed characters\n") unless $str =~ m/\A[A-Za-z_][-A-Za-z0-9_]*\z/;
  ldie("identifier '$str' is too long (only 16 chars allowed)\n") if length($str) > 16;
}
# semantic checks
# Resolve a resource name given on the command line.
# If $mars/resource-$res is not a directory, the argument is compared with
# the targets of the device-$host links (and then of the _direct-*-$host
# links) of all resources, in order to substitute the resource name.
# Returns the (possibly substituted) resource name.
# NOTE(review): $count is never incremented in the lines visible here and
# several closing braces are absent; the excerpt appears truncated -- confirm.
sub check_res {
my $res = shift;
if (not -d "$mars/resource-$res") {
# DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique
my $count = 0;
my $found;
my @tests = glob("$mars/resource-*/device-$host");
foreach my $test (@tests) {
# $unchecked=2: unreadable links are tolerated without a warning
my $target = get_link($test, 2);
if ($target eq $res) {
$found = $test;
if (!$count) {
@tests = glob("$mars/resource-*/_direct-*-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
# keep only the part after the last comma of the link target
$target =~ s/^.*,//;
if ($target eq $res) {
$found = $test;
ldie "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found;
# reduce the matching path to the resource name embedded in it
$found =~ s:^.*/resource-(.*)/.*$:$1:;
lwarn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n";
$res = $found;
return $res;
# Verify that the physical device of $host is not smaller than the logical
# size of resource $res. The actsize-$host link is optional (read with
# $unchecked=1); if it is present and smaller than the size link, the
# mismatch is fatal.
sub check_sizes {
  my ($res, $host) = @_;
  my $resdir = "$mars/resource-$res";
  my $logical_size = get_link("$resdir/size");
  my $physical_size = get_link("$resdir/actsize-$host", 1);
  if (defined($physical_size) && $physical_size < $logical_size) {
    ldie("physical device on host '$host' has size $physical_size, which is smaller than the logical resource size $logical_size\n");
  }
}
# Ensure that this host has joined resource $res (its data-$host entry
# exists), then run the size check for this host.
sub check_res_member {
  my $res = shift;
  unless (-e "$mars/resource-$res/data-$host") {
    ldie("sorry, I have not yet joined to resource '$res'\n");
  }
  check_sizes($res, $host);
}
# Check the sync progress of $host on resource $res.
# When a syncstatus-$host entry exists, its value is compared with the
# resource's size link; a smaller value is fatal.
# NOTE(review): closing braces are absent in this excerpt, so it cannot be
# told from here whether the final "OK" message is inside the if -- confirm.
sub check_sync_finished {
my ($res, $host) = @_;
my $lnk = "$mars/resource-$res/syncstatus-$host";
# lstat: test for the existence of the link itself
if (lstat($lnk)) {
my $syncstatus = get_link($lnk, 1);
my $size = get_link("$mars/resource-$res/size");
ldie "sync has not yet finished, only $syncstatus / $size bytes transferred\n" unless $syncstatus >= $size;
lprint "OK, it seems that sync has finished on $host.\n";
# Require that this host is primary for resource $res, both actually
# (actual-$host/is-primary link is true) and by designation (primary link
# names $host). Either violation is fatal; $cmd is used in the message.
sub check_primary {
  my ($cmd, $res) = @_;
  my $is_primary = get_link("$mars/resource-$res/actual-$host/is-primary");
  ldie("for operation '$cmd' I need to be primary\n") unless $is_primary;
  my $designated = _get_designated_primary($res);
  ldie("for operation '$cmd', I also must be the designated primary\n") unless $designated eq $host;
}
# Refuse operation $cmd when this host is primary for resource $res,
# either actually (is-primary link true) or by designation.
sub check_not_primary {
my ($cmd, $res) = @_;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk);
if ($is_primary) {
ldie "operation '$cmd' cannot be executed on primary\n";
# also check whether we intend to become primary
my $primary = _get_designated_primary($res);
ldie "operation '$cmd' cannot be executed on designated primary\n" if $primary eq $host;
# Loop until _get_actual_primary reports no primary for resource $res.
# NOTE(review): no delay or timeout call is visible inside the loop in this
# excerpt; the block appears truncated -- confirm against the full file.
sub check_primary_gone {
my ($res) = @_;
for (;;) {
my $pri = _get_actual_primary($res);
last if !$pri;
lprint "waiting for other primary host ($pri) to disappear....\n";
# Loop until the actual primary of resource $res equals the designated one.
# A missing actual primary is represented as '(none)' for the comparison.
# NOTE(review): no delay or timeout call is visible inside the loop in this
# excerpt; the block appears truncated -- confirm against the full file.
sub check_primary_settled {
my ($res) = @_;
# the designated primary is read once, before the loop
my $target_primary = _get_designated_primary($res);
for (;;) {
my $actual_primary = _get_actual_primary($res) || '(none)';
last if ($target_primary eq $actual_primary);
lprint "waiting for primary $target_primary to be settled\n";
# Check (or wait for) the value of switch todo-$host/$key of resource $res.
#   $cmd       - command name, used in messages only
#   $val       - value to compare with (numeric comparison)
#   $wait      - false: a mismatch is fatal; true: keep looping
#   $unchecked - passed through to get_link; if the link cannot be read
#                the sub returns without a verdict
#   $inv       - true: the switch must NOT have value $val
# NOTE(review): closing braces and any delay between loop iterations are
# not visible in this excerpt -- confirm against the full file.
sub check_todo {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
my $path = "$mars/resource-$res/todo-$host/$key";
my $link;
for (;;) {
$link = get_link($path, $unchecked);
return unless defined($link);
if (defined($inv) && $inv) {
last if $link != $val;
ldie "cannot execute $cmd: switch '$key' must not have value '$val'\n" if !$wait;
lprint "waiting until switch '$key' leaves the value '$val'....\n";
} else {
last if $link == $val;
ldie "cannot execute $cmd: switch '$key' must have value '$val', but actually has value '$link'\n" if !$wait;
lprint "waiting until switch '$key' reaches the value '$val'....\n";
lprint "OK, '$path' has acceptable value '$link'\n";
# Check (or wait for) the value of status link actual-$host/$key of
# resource $res. Same parameters and structure as check_todo, but reading
# from the actual-$host directory instead of todo-$host.
#   $wait - false: a mismatch is fatal; true: keep looping
#   $inv  - true: the link must NOT have value $val
# NOTE(review): closing braces and any delay between loop iterations are
# not visible in this excerpt -- confirm against the full file.
sub check_status {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
my $path = "$mars/resource-$res/actual-$host/$key";
my $link;
for (;;) {
$link = get_link($path, $unchecked);
# unreadable link (only possible with a true $unchecked): no verdict
return unless defined($link);
if (defined($inv) && $inv) {
last if $link != $val;
ldie "cannot execute $cmd: '$path' must not have value '$val'\n" if !$wait;
lprint "waiting until '$key' leaves the value '$val'...\n";
} else {
last if $link == $val;
ldie "cannot execute $cmd: '$path' must have value '$val'\n" if !$wait;
lprint "waiting until '$key' reaches the value '$val'...\n";
lprint "OK, '$path' has acceptable value '$link'\n";
# Check if modification time of file or symlink is older than age
# Returns 1 if modification time is not $age seconds older than a
# reference time, either lamport clock for symlinks or system
# time for regular files. Return value 0 means "file is new enough".
# Returns 2 if modification time or reference time cannot be determined
# Note: symlinks are created/updated with lamport clock timestamps
# while normal files get timestamps from system time
sub _check_file_aged {
my ($path, $age) = @_;
my $mtime;
my $reftime; # reference time ("now")
if (-l $path) {
# symlink: take the mtime of the link itself, compare against mars_time()
$mtime = (lstat($path))[9];
# NOTE(review): the fallback of mars_time() appends nine zeros to time(),
# which int() does not strip -- confirm the units agree with $mtime.
$reftime = int(mars_time()); # discards sub-second resolution
elsif (-f $path) {
# regular file: mtime of the file, compared against system time
$mtime = (stat($path))[9];
$reftime = time();
# neither symlink nor regular file leaves both values undefined -> 2
return 2 if (!$mtime or !$reftime); # error -> file is aged
return ($mtime < $reftime - $age) ? 1 : 0;
# Check if any file below $path was modified in the last $age seconds
# $path is a glob pattern. Returns 0 if _check_file_aged reports 0
# ("new enough") for at least one match, otherwise 1 (also when the
# pattern matches nothing).
sub _check_files_modified_any_of {
my ($path, $age) = @_;
my @list = glob($path);
my $res = 1;
foreach my $p (@list) {
if (!_check_file_aged($p, $age)) {
$res = 0;
return $res;
# Determine the smallest and largest sequence number among the paths
# matching $glob.
#   $res          - resource name (not used in the visible body)
#   $glob         - glob pattern; it is fatal if nothing matches
#   $take_symlink - true: extract the number from the link target instead
#                   of from the path name
# Returns the list ($min, $max).
sub _get_minmax {
my ($res, $glob, $take_symlink) = @_;
# -1 serves as the "not yet set" marker
my $min = -1;
my $max = -1;
my @paths = glob($glob) or ldie "cannot find '$glob'\n";
foreach my $path (@paths) {
my $nr = $path;
if ($take_symlink) {
$nr = get_link($path);
# reduce "...<letters>-<digits>[-<suffix>]" to the digits
$nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
return ($min, $max);
# Return (min, max) of the sequence numbers of the log-* entries of $res.
sub get_minmax_logfiles {
  my $res = shift;
  my $pattern = "$mars/resource-$res/log-*";
  return _get_minmax($res, $pattern, 0);
}
# Return (min, max) of the sequence numbers of the version-* entries of $res.
sub get_minmax_versions {
  my $res = shift;
  my $pattern = "$mars/resource-$res/version-*";
  return _get_minmax($res, $pattern, 0);
}
# Return (min, max) of the sequence numbers over both the log-* and the
# version-* entries of $res.
sub get_minmax_any {
  my $res = shift;
  my $pattern = "$mars/resource-$res/{log,version}-*";
  return _get_minmax($res, $pattern, 0);
}
# Return (min, max) of the sequence numbers found in the targets of the
# replay-* links of $res (numbers are taken from the link targets).
sub get_minmax_replays {
  my $res = shift;
  my $pattern = "$mars/resource-$res/replay-*";
  return _get_minmax($res, $pattern, 1);
}
# Compare the version links of resource $res and die on a mismatch
# ("splitbrain").
#   $host     - check only links originating from this host (all if unset)
#   $sequence - check only up to this sequence number (all if < 0)
# NOTE(review): closing braces, loop exits and delay calls are not visible
# in this excerpt; the nesting described by the comments below is partly
# inferred -- confirm against the full file.
sub check_splitbrain {
# check only the chain of $host (or all hosts if unset)
# check up to $sequence (or for all if < 0)
my ($res, $host, $sequence) = @_;
if ($sequence < 0) {
# the designated primary is set to "(none)" while an actual primary exists
my $old = _get_actual_primary($res) || "(none)";
_primary_res($res, "(none)", $old) unless $old eq "(none)";
# loop while any log/version/replay entry was modified within 60 seconds
while (!_check_files_modified_any_of("$mars/resource-$res/{log,version,replay}-*", 60)) {
lprint "resource directory $res not stable, waiting....\n";
while (1) {
my ($min_log, $max_log) = get_minmax_logfiles($res);
my ($min_ver, $max_ver) = get_minmax_versions($res);
my ($min_rep, $max_rep) = get_minmax_replays($res);
if ($min_ver > $min_log || $max_ver < $max_log) {
lprint "some version links are missing...\n";
if ($max_log >= $max_rep) {
lprint "resource $res: logfile $max_log is present.\n";
lprint "resource $res: logfile $max_log is not yet transferred (need $max_rep), waiting....\n";
} # $sequence < 0
my $glob = "$mars/resource-$res/version-[0-9]*";
my @links = glob($glob);
if (!@links) {
@links = glob("$mars/resource-$res/version-[0-9]*-*");
ldie "no version information available\n" unless @links;
lprint "assuming that I am primary for the first time\n";
@links = sort(@links);
foreach my $link (@links) {
# sequence number embedded in the link name
my $nr = $link;
$nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:;
$nr = int($nr);
next if ($sequence >= 0 && $nr > $sequence);
# host name suffix of the link name
my $fromhost = $link;
$fromhost =~ s:^.*version-[0-9]*-(.*)$:$1:;
my $version = get_link($link);
# host name embedded in the second comma-separated field of the link value
my $otherhost = $version;
$otherhost =~ s/^[^,]*?,(?:log-[0-9]*?-|)([^,]*?),.*$/$1/;
my $otherlink = sprintf("$mars/resource-$res/version-%09d-$otherhost", $nr);
my $otherversion = get_link($otherlink);
# ignore foreign mismatches
if ($host) {
next if $fromhost ne $host;
# by definition, the originator of a logfile is always "right"
next if $otherhost eq $fromhost;
# final check
ldie "splitbrain at sequence $nr detected\n" unless $version eq $otherversion;
# Parse a size argument such as "4096", "10g" or "1.5T" into bytes.
#   argument - decimal number with optional fraction, optionally followed
#              by one of the suffixes k, m, g, t, p (case-insensitive,
#              powers of 1024)
#   returns  - the size in bytes
# Fatal (ldie) when the argument is malformed or the resulting size is not
# a multiple of 4096.
sub get_size {
  my $arg = shift || "";
  if ($arg !~ m/\A([0-9]+(?:\.[0-9]*)?)([kmgtp]?)\z/i) {
    ldie("size argument '$arg' must be a number, optionally followed by suffix k or m or g or t or p\n");
  }
  my $suffix = lc($2 || "");
  $arg = $1;
  # Table lookup instead of matching against $_: the caller's $_ stays
  # untouched and no loop label is needed.
  my %power_of = (k => 1, m => 2, g => 3, t => 4, p => 5);
  if (exists $power_of{$suffix}) {
    $arg *= 1024 for 1 .. $power_of{$suffix};
  }
  ldie("size '$arg' is not a multiple of 4k\n") if ($arg % 4096) != 0;
  return $arg;
}
# Get actual primary node from links below actual-*/ subdirs
# Scan the actual-*/is-primary links of resource $res and return the host
# name taken from the directory name of a link whose value is true;
# returns undef when no such link exists.
# NOTE(review): the comment below speaks of the first link being selected,
# but no loop exit is visible in this excerpt -- confirm.
sub _get_actual_primary {
my ($res) = @_;
my @primary_links = glob("$mars/resource-$res/actual-*/is-primary");
my $primary;
foreach my $link (@primary_links) {
if (my $val = get_link($link)) {
# host name = the part between "actual-" and "/is-primary"
$primary = ($link =~ qr%.*actual-([^/]+)/is-primary%)[0];
# Note: if there are more than one 'is-primary' links (an insane state anyway),
# the first 'is-primary' link is selected. Other links are ignored.
return $primary;
# Return the value of the "primary" link of resource $res.
# An unreadable link is fatal (get_link is called in checked mode).
sub _get_designated_primary {
  my $res = shift;
  my $primary_link = "$mars/resource-$res/primary";
  return get_link($primary_link);
}
# Return the list of host names for which a connect-<host> entry exists
# in resource $res (the entry names with the path prefix removed).
sub get_peers {
  my ($res) = @_;
  my $prefix = "$mars/resource-$res/connect-";
  my @list = glob("$prefix*");
  # \Q...\E: the prefix is a literal path, not a regex; the anchor makes
  # sure only the leading prefix is stripped.
  s/^\Q$prefix\E// for @list;
  return @list;
}
# helpers
# Write "1" to /proc/sys/mars/trigger via the shell; all output and errors
# of the command are discarded.
sub _trigger {
  my $shell_cmd = "(echo 1 > /proc/sys/mars/trigger) >/dev/null 2>&1";
  system($shell_cmd);
}
# Set the switch symlink $path to "1" ($on true) or "0" ($on false).
#   $cmd - command name for messages; messages are skipped when it is empty
#   $res - resource name, used in messages only
# The new link is created under a temporary name and renamed over $path.
# NOTE(review): closing braces are absent in this excerpt, and nothing
# visible ends the sub after the "already activated" message -- confirm.
sub _switch {
my ($cmd, $res, $path, $on) = @_;
my $src = $on ? "1" : "0";
my $old = get_link($path);
# note: an old value of "0" is false, so this test only fires for "1"
if ($old && $old eq $src) {
lprint "${cmd} on resource $res is already activated\n" if $cmd;
# temporary name: ".tmp." is inserted in place of the last "/" of $path
my $tmp = $path;
$tmp =~ s/\/([^\/]+)$/.tmp.$1/;
symlink($src, $tmp) or ldie "cannot create switch symlink\n";
rename($tmp, $path) or ldie "cannot rename switch symlink\n";
lprint "successfully started ${cmd} on resource $res\n" if $cmd;
# Set ($on true) or clear ($on false) the owner write bit of $path.
# Only the owner permission bits (0700) of the current mode are carried
# over into the new mode. Fatal when $path cannot be examined or changed.
sub _writable {
  my ($path, $on) = @_;
  # Without this check a missing path would silently yield mode 0.
  my @stat = lstat($path) or ldie("cannot stat '$path'\n");
  my $oldmode = $stat[2] & 0700;
  my $newmode = $on ? ($oldmode | 0200) : ($oldmode & ~0200);
  lprint("chmod '$path' $oldmode $newmode\n");
  chmod($newmode, $path) == 1 or ldie("cannot chmod '$path'\n");
}
# Return the first IPv4 address listed after "inet" in the output of
# "/sbin/ip addr show dev eth0", or undef when there is none.
sub _get_ip {
  my @lines = `/sbin/ip addr show dev eth0`;
  chomp(@lines);
  for my $line (@lines) {
    return $1 if $line =~ m#\sinet\s(\d+\.\d+\.\d+\.\d+)#;
  }
  return undef;
}
# Which primary was active in the past when logfile number $log_nr was created?
# In general, this may be even a node which doesn't exist anymore.
# $supposed_primary should be normally empty, but may be used
# to give a hint and check for consistency of ancient knowledge.
#   $basedir          - resource directory
#   $log_nr           - logfile sequence number
#   $supposed_primary - optional hint, see above
# Looks first at the "log" and then at the "version" names carrying number
# $log_nr and derives the host name from their suffix.
# NOTE(review): closing braces are absent in this excerpt, so the exact
# position of "return $primary" relative to the loop cannot be told from
# here -- confirm against the full file.
sub _get_former_primary {
my ($basedir, $log_nr, $supposed_primary) = @_;
my $primary = $supposed_primary;
foreach my $type ("log", "version") {
my $base_path = sprintf("$basedir/$type-%09d-", $log_nr);
my $pri_path = "$base_path$supposed_primary";
my $log_path = "$base_path*";
my @names = glob($log_path);
if (!@names) {
if ($type eq "log" and !$supposed_primary) {
ldie "Sorry, there exist no names '$log_path'\n" unless $supposed_primary;
} elsif (scalar(@names) == 1) {
# exactly one candidate: it must agree with the hint, if one was given
my $found = $names[0];
if ($supposed_primary) {
ldie "Sorry, '$pri_path' does not exist, although '$found' would exist.\n" unless $pri_path eq $found;
} else { # assume that the found name is the right one.
$pri_path = $found;
lprint "found '$pri_path'\n";
# the host name is whatever follows the numbered prefix
ldie "found name is malformed\n" unless $pri_path =~ m:^$base_path(.*):;
$primary = $1;
} else { # multiple exist...
lprint "There are multiple names with number $log_nr.\n";
my $present = 0;
foreach my $file (@names) {
lprint " $file\n";
$present++ if $file eq $pri_path;
if ($type eq "log") {
lwarn "Usually, this is an indication for split-brain.\n";
lwarn "Be careful!\n";
ldie "Cannot select between them -- no primary preference given.\n" unless $supposed_primary;
ldie "Sorry, '$pri_path' is not among them.\n" unless $present;
lprint "=> using '$pri_path' out of them\n";
return $primary;
ldie "could not determine old primary site for logfile version number $log_nr\n";
# Create the version link of this host for sequence number $log_nr in
# $basedir by copying the value of the primary's version link (or, if that
# value is empty, of the first other version link with the same number).
#   $primary - host whose version link is copied; determined via
#              _get_former_primary when empty
# NOTE(review): closing braces are absent, and nothing visible ends the sub
# after the "makes no sense" warning -- confirm against the full file.
sub _fake_versionlink {
my ($basedir, $log_nr, $primary) = @_;
$primary = _get_former_primary(@_) unless $primary;
my $new_version = sprintf("$basedir/version-%09d-$host", $log_nr);
my $pri_version = sprintf("$basedir/version-%09d-$primary", $log_nr);
if ($primary eq $host) {
lwarn "it makes no sense to fake my own version link '$new_version'\n";
# NOTE(review): get_link in checked mode is fatal on an unreadable link,
# so the fallback below can only trigger for an empty or "0" value -- confirm.
my $pri_link = get_link($pri_version);
if (!$pri_link) { # try any one else
lwarn "cannot read symlink '$pri_version' -- trying a neighbor link instead\n";
my $try_version = sprintf("$basedir/version-%09d-*", $log_nr);
my @test = glob($try_version);
my $test_version = shift @test;
if ($test_version) {
lwarn "trying substitute symlink '$test_version'\n";
my $test_link = get_link($test_version);
if ($test_link) {
$pri_link = $test_link;
lwarn "got value '$pri_link', hopefully this is right\n" if $pri_link;
if ($pri_link) {
lprint "creating new version symlink '$new_version' -> '$pri_link'\n";
# the link is built under a .tmp name and then moved into place;
# rm and mv are run through the shell with the path interpolated
system("rm -f $new_version.tmp");
symlink($pri_link, "$new_version.tmp") or ldie "cannot create faked version symlink '$new_version'\n";
system("mv $new_version.tmp $new_version");
} else {
lwarn "cannot read symlink '$pri_version' -- cannot create faked versionlink '$pri_version'\n";
# Point the replay link of this host in $basedir at the start of logfile
# number $log_nr of the former primary (value "log-<nr>-<primary>,0,0").
# All arguments, including an optional third one, are passed on to
# _get_former_primary. For $log_nr > 1 the version link of the preceding
# number is faked as well.
sub _set_replaylink {
my ($basedir, $log_nr) = @_;
my $primary = _get_former_primary(@_);
my $rep_path = "$basedir/replay-$host";
my $rep_val = sprintf("log-%09d-$primary,0,0", $log_nr);
lprint "creating new replaylink '$rep_path' -> '$rep_val'\n";
# built under a .tmp name and then moved into place via the shell
system("rm -f $rep_path.tmp");
symlink($rep_val, "$rep_path.tmp") or ldie "cannot create symlink '$rep_path'\n";
system("mv $rep_path.tmp $rep_path");
if ($log_nr > 1) {
my $old_primary = "";
my $vers_link = sprintf("$basedir/version-%09d-$primary", $log_nr);
my $vers_val = get_link($vers_link);
# host name of the preceding logfile, if the version value names one
$old_primary = $1 if $vers_val =~ m/:.*,log-[0-9]+-([^,]+),/;
_fake_versionlink($basedir, $log_nr - 1, $old_primary);
# commands
# Command handler that does nothing except report that the command is
# being ignored for the given resource.
sub ignore_cmd {
  my ($cmd, $res) = @_;
  lprint("ignoring command '$cmd' on resource '$res'\n");
}
# Command handler for commands that have no meaning here: prints a notice
# and otherwise does nothing. $res is accepted but not used.
sub senseless_cmd {
  my ($cmd, $res) = @_;
  lprint("command '$cmd' makes no sense with MARS (ignoring)\n");
}
# Command handler for commands that must not be used: always fatal.
# $res is accepted but not used.
sub forbidden_cmd {
  my ($cmd, $res) = @_;
  ldie("command '$cmd' cannot be used with MARS (it is impossible to carry out uniquely and could therefore lead to a disaster)\n");
}
# Command handler for commands that are not implemented yet: always fatal.
# $res is accepted but not used.
sub nyi_cmd {
  my ($cmd, $res) = @_;
  ldie("command '$cmd' is not yet implemented\n");
}
# Return the first line of lsmod output that contains "mars", or undef
# when there is none (or lsmod could not be run). Callers use the result
# as a boolean.
sub is_module_loaded {
  # Lexical handle and list-form pipe open: no global bareword handle, no
  # shell, and a failing open is detected instead of being read from.
  open(my $lsmod, "-|", "lsmod") or return undef;
  my $first_match;
  while (my $line = <$lsmod>) {
    next unless $line =~ /mars/;
    $first_match = $line;
    last;
  }
  close($lsmod);
  return $first_match;
}
# Create the basic directory layout below $mars (unless already present)
# and register this host: ips/ip-$host points to $ip and
# todo-global/deleted-$host is set to "1".
# $cmd is accepted but not used.
sub _create_cluster {
  my ($cmd) = @_;
  my @dirs = (
    $mars,
    "$mars/ips",
    "$mars/userspace",
    "$mars/defaults",
    "$mars/defaults-$host",
    "$mars/todo-global",
  );
  foreach my $dir (@dirs) {
    next if -d $dir;
    # Builtin mkdir instead of a shell command; a failure stays non-fatal
    # as before, but is reported through the regular warning channel.
    mkdir($dir) or lwarn("cannot create directory '$dir': $!\n");
  }
  symlink($ip, "$mars/ips/ip-$host");
  symlink("1", "$mars/todo-global/deleted-$host");
}
# Handler for "create-cluster". Refuses when $mars/ips already exists
# (unless $force) or when the mars kernel module is loaded.
# NOTE(review): only the two precondition checks are visible in this
# excerpt; the step that actually creates the cluster is not -- confirm.
sub create_cluster {
my ($cmd, $peer) = @_;
ldie "cluster is already created\n" if !$force && -d "$mars/ips";
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
# Handler for "join-cluster": copy the ips directory from $peer via rsync,
# add this host's own ip link and copy the directory back to $peer.
# Refuses when resources or ip links already exist locally (unless $force)
# or when the mars kernel module is loaded.
# NOTE(review): $peer is interpolated into shell command lines unquoted and
# is not checked for being defined -- confirm that callers validate it.
sub join_cluster {
my ($cmd, $peer) = @_;
if (glob("$mars/resource-*") or glob("$mars/ips/*")) {
ldie "Sorry, some resources already exist!\nThis is dangerous!\nIf you are sure that no resource clash is possible, re-invoke this command with '--force' option\n" unless $force;
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
lprint "joining cluster via rsync (peer='$peer')\n";
# check connection
system("ssh $peer uname -a") == 0 or ldie "oops, no connection to $peer ...\n";
system("rsync --recursive --links -v $peer:$mars/ips/ $mars/ips/") == 0 or ldie "oops\n";
# result not checked: an already existing link is left as it is
symlink($ip, "$mars/ips/ip-$host");
system("rsync --recursive --links -v $mars/ips/ $peer:$mars/ips/") == 0 or ldie "oops\n";
# Handler for "leave-cluster". Refuses (unless $force) while any entry
# ending in -$host exists in some resource directory.
# NOTE(review): the path uses the literal "/mars" instead of $mars, and no
# action beyond the check is visible in this excerpt -- confirm.
sub leave_cluster {
my ($cmd) = @_;
my $check = "/mars/resource-*/*-$host";
ldie "I am member of some resources\n" if glob($check) && !$force;
# Handler for "create-resource" and "join-resource".
#   $cmd    - "create-resource" creates a new resource; any other value
#             joins an existing one
#   $res    - resource name
#   $dev    - block device path, or a size accepted by get_size (in which
#             case a sparse file is used instead of a device)
#   $appear - name of the local device entry; defaults to $res
# NOTE(review): many closing braces are absent in this excerpt; the nesting
# indicated by the comments below is inferred -- confirm against the full
# file.
sub create_res {
my ($cmd, $res, $dev, $appear) = @_;
my $create = ($cmd eq "create-resource");
ldie "undefined device or size argument\n" unless $dev;
$appear = $res if !$appear;
check_id($appear) if $appear;
if ($create) {
ldie "resource '$res' already exists\n" if -d "$mars/resource-$res";
lprint "creating new resource '$res'\n";
} else {
ldie "resource '$res' has been already joined -- this is dangerous!\n" if -e "$mars/resource-$res/connect-$host";
lprint "joining to existing resource '$res'\n";
# NOTE(review): get_size, as visible in this file, is fatal for anything
# that is not a number with optional suffix, so a device path would not
# get past this call -- confirm.
my $size = get_size($dev);
if ($size > 0) {
$dev = "";
} else {
ldie "block device '$dev' does not exist\n" unless -b $dev;
ldie "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//;
use Fcntl 'SEEK_END';
# the device size is obtained by seeking to its end;
# the handle TEST is not closed in the visible code
open(TEST, "<$dev") or ldie "cannot open device for reading\n";
$size = sysseek(TEST, 0, SEEK_END);
lprint "device size = $size bytes\n";
ldie "implausible size $size" unless $size > 0;
# a new resource is assembled in a temporary directory and renamed at the end
my $tmp = "$mars/.tmp.$res";
my $primary;
my $replay_nr = -1;
if ($create) {
system("rm -rf $tmp");
system("mkdir $tmp") == 0 or ldie "could not create resource '$res'\n";
symlink($size, "$tmp/size") or ldie "cannot create size indicator symlink\n";
} else {
# joining works directly in the existing resource directory
$tmp = "$mars/resource-$res";
ldie "resource '$res' does not exist\n" unless -d $tmp;
$primary = _get_designated_primary($res);
if ($primary eq "(none)") {
# no designated primary: take the host named in the last replay link
my @list = glob("$tmp/replay-*") or ldie "cannot find any candidate for primary\n";
my $first = pop @list or ldie "bad glob list\n";
$primary = get_link($first);
$primary =~ s/^log-[0-9]+-(.*),.*,.*$/$1/;
lprint "using '$primary' as primary\n";
ldie "resource '$res' is already joined\n" if -e "$tmp/data-$host";
ldie "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless -l "$mars/ips/ip-$host";
my $oldsize = get_link("$tmp/size");
if ($size < $oldsize) {
lprint "adjusting size to $oldsize\n";
$size = $oldsize;
ldie "sizes differ: real size = $oldsize, but requested size = $size\n" unless $oldsize == $size;
my $replay = get_link("$tmp/replay-$primary");
if ($replay =~ m/^log-([0-9]+)-/) {
$replay_nr = $1;
} else {
ldie "cannot determine current logfile number.\n";
my $file = "$tmp/data-$host";
if (!$dev) {
lprint "creating sparse file '$file' with size $size\n";
open(OUT, ">$file") or ldie "could not open '$file'\n";
use Fcntl 'SEEK_SET';
sysseek(OUT, $size-1, SEEK_SET) == $size-1 or ldie "could not seek\n";
# NOTE(review): '\0' in single quotes is the two characters backslash and
# zero, so this writes a backslash as the last byte, not a NUL byte --
# presumably "\0" was intended; confirm.
syswrite(OUT, '\0', 1) == 1 or ldie "cannot init sparse file\n";
close OUT;
} else {
lprint "using existing device '$dev'\n";
symlink($dev, $file) or ldie "cannot create device symlink\n";
if ($appear) {
# TODO: check for uniqueness of $appear
lprint "resource '$res' will appear as local device '/dev/mars/$appear'\n";
system("rm -f $tmp/device-$host");
symlink($appear, "$tmp/device-$host") or ldie "cannot create symlink for local device appearance\n";
mkdir("$tmp/userspace") unless -d "$tmp/userspace";
mkdir("$tmp/defaults") unless -d "$tmp/defaults";
# NOTE(review): no creation of the $todo directory is visible before the
# unchecked symlink calls below -- confirm.
my $todo = "$tmp/todo-$host";
symlink("1", "$todo/attach");
symlink("1", "$todo/connect");
symlink("1", "$todo/sync");
symlink("1", "$todo/allow-replay");
system("rm -f $tmp/syncstatus-$host");
if ($create) {
# creator: becomes primary, counts as fully synced, starts with logfile 1
symlink($host, "$tmp/primary") or ldie "cannot create primary symlink\n";
symlink($size, "$tmp/syncstatus-$host") or ldie "cannot create primary syncstatus\n";
symlink("log-000000001-$host,0,0", "$tmp/replay-$host") or ldie "cannot create replay status\n";
system("touch $tmp/log-000000001-$host");
rename($tmp, "$mars/resource-$res") or ldie "cannot finalize resource '$res'\n";
lprint "successfully created resource '$res'\n";
} else {
# joiner: replay starts at the primary's current logfile, sync starts at 0
_set_replaylink($tmp, $replay_nr, $primary);
symlink("0", "$tmp/syncstatus-$host") or ldie "cannot start initial sync\n";
system("rm -f $tmp/connect-$host");
symlink($primary, "$tmp/connect-$host") or ldie "cannot create peer connect symlink\n";
symlink($host, "$tmp/connect-$primary") unless -l "$tmp/connect-$primary";
lprint "successfully joined resource '$res'\n";
# Handler for "leave-resource".
# Requires every todo-$host switch and every actual-$host status link of
# the resource to be off (false). Then every connect-* link of other hosts
# that points to this host is redirected to this host's own peer.
# NOTE(review): closing braces are absent in this excerpt, and no removal
# of this host's own entries is visible -- confirm against the full file.
sub leave_res {
my ($cmd, $res) = @_;
foreach my $tmp (glob("$mars/resource-$res/todo-$host/*")) {
my $status = get_link($tmp);
ldie "switch '$tmp' is not off\n" if $status;
foreach my $tmp (glob("$mars/resource-$res/actual-$host/*")) {
my $status = get_link($tmp);
ldie "running status '$tmp' is not off\n" if $status;
my $peerlink = "$mars/resource-$res/connect-$host";
my $peer = get_link($peerlink);
foreach my $tmp (glob("$mars/resource-$res/connect-*")) {
next if $tmp eq $peerlink;
my $target = get_link($tmp);
next unless $target eq $host;
lprint "changing '$tmp' from '$host' to '$peer'\n";
# replace the link via a ".new" name and rename
symlink($peer, "$tmp.new") or ldie "cannot create symlink '$tmp.new'\n";
rename("$tmp.new", $tmp) or ldie "cannot create symlink '$tmp'\n";
# Handler for "log-rotate": create the next, empty logfile of this host
# for resource $res, numbered one above the last existing one.
# NOTE(review): closing braces are absent, and nothing visible ends the sub
# after the "nothing to do" message -- confirm against the full file.
sub logrotate_res {
my ($cmd, $res) = @_;
my @paths = glob("$mars/resource-$res/log-*-$host") or ldie "cannot find any logfiles\n";
# plain string sort; take the last name
@paths = sort(@paths);
my $last = pop(@paths);
if (-z $last) {
lprint "an empty logfile '$last' already exists, nothing to do.\n";
my $nr = $last;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1);
ldie "logfile '$next' already exists\n" if -e $next;
system("touch $next");
# Determine the range of logfile numbers of resource $res.
# Returns ($min, $max): $min is the smallest existing logfile number; $max
# starts as the largest existing logfile number and is then lowered to the
# smallest logfile number referenced by any replay-* link. The messages
# below label $max as the "max non-deletable" number.
# $cmd is accepted but not used.
sub _get_deletable_logfiles {
my ($cmd, $res) = @_;
# -1 serves as the "not yet set" marker
my $min = -1;
my $max = -1;
my @log_paths = glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
foreach my $path (@log_paths) {
$path =~ m/\/log-([0-9]+)-/;
my $nr = $1;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
my @paths = glob("$mars/resource-$res/replay-*") or ldie "cannot find any replay symlinks\n";
foreach my $path (@paths) {
my $target = get_link($path);
$target =~ m/^log-([0-9]+)/;
my $nr = $1;
# note the "<": $max is only ever lowered here
$max = $nr if ($nr < $max || $max < 0);
lprint "min deletable logfile number: $min\n";
lprint "max non-deletable logfile number: $max\n";
return ($min, $max);
# Create a new link $mars/todo-global/delete-<nr> pointing to $target,
# where <nr> is one above the number of the last existing delete-* entry
# (or 1 when there is none).
# The result of symlink() is not checked.
sub _create_delete {
my ($target) = @_;
my $nr = 0;
my @paths = glob("$mars/todo-global/delete-*");
if (@paths) {
# last entry in glob order
my $last = pop(@paths);
$nr = $last;
$nr =~ s/^.*delete-([0-9]+)$/$1/;
my $new = sprintf("$mars/todo-global/delete-%09d", $nr + 1);
lprint "create symlink $new -> $target\n";
symlink($target, $new);
# Handler for "log-delete" and "log-delete-all": select logfiles of
# resource $res whose number lies below the limit computed by
# _get_deletable_logfiles. For any command other than "log-delete-all"
# the limit is reduced so that only the first number qualifies.
# NOTE(review): closing braces are absent, and no deletion step is visible
# after a logfile or version link has been chosen -- the excerpt appears
# truncated; confirm against the full file.
sub logdelete_res {
my ($cmd, $res) = @_;
my @paths = glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
@paths = sort(@paths);
my ($min_deletable, $max_deletable) = _get_deletable_logfiles(@_);
if ($min_deletable >= $max_deletable) {
lprint "no logfiles are deletable.\n";
if ($cmd ne "log-delete-all") {
$max_deletable = $min_deletable + 1; # delete only the first one
my $nr = 0;
for (;;) {
my $first = shift(@paths);
last unless $first;
$nr = $first;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
next unless $nr < $max_deletable;
lprint "chosen '$first' for deletion\n";
lprint "removing left-over version symlinks...\n";
foreach my $versionlink (glob("$mars/resource-$res/version-*")) {
my $nrv = $versionlink;
$nrv =~ s/^.*\/version-([0-9]+)-.+$/$1/;
# we need at least one more version link than logfiles for consistency checks
next unless $nrv < $max_deletable - 1;
# Handler for "attach" / "detach": set this host's attach switch of
# resource $res; it is switched off for "detach" and on otherwise.
sub attach_res {
  my ($cmd, $res) = @_;
  my $switch = "$mars/resource-$res/todo-$host/attach";
  _switch($cmd, $res, $switch, $cmd ne "detach");
}
# Handler for "connect" / "disconnect": set the connect switch in the
# todo-* directory of every host of resource $res; switched off for
# "disconnect" and on otherwise.
sub connect_res {
  my ($cmd, $res) = @_;
  my $enable = ($cmd ne "disconnect");
  foreach my $todo_dir (glob("$mars/resource-$res/todo-*/")) {
    _switch($cmd, $res, "$todo_dir/connect", $enable);
  }
}
# Handler for "connect-local" / "disconnect-local": set only this host's
# connect switch of resource $res; off for "disconnect-local", on otherwise.
sub connect_local_res {
  my ($cmd, $res) = @_;
  my $switch = "$mars/resource-$res/todo-$host/connect";
  _switch($cmd, $res, $switch, $cmd ne "disconnect-local");
}
# Handler for "pause-sync" / "resume-sync": set the sync switch in the
# todo-* directory of every host of resource $res; switched off for
# "pause-sync" and on otherwise.
sub pause_sync_res {
  my ($cmd, $res) = @_;
  my $enable = ($cmd ne "pause-sync");
  foreach my $todo_dir (glob("$mars/resource-$res/todo-*/")) {
    _switch($cmd, $res, "$todo_dir/sync", $enable);
  }
}
# Handler for "pause-sync-local" / "resume-sync-local": set only this
# host's sync switch of resource $res; off for "pause-sync-local", on
# otherwise.
sub pause_sync_local_res {
  my ($cmd, $res) = @_;
  my $switch = "$mars/resource-$res/todo-$host/sync";
  _switch($cmd, $res, $switch, $cmd ne "pause-sync-local");
}
# Handler for "pause-replay" / "resume-replay": set the allow-replay
# switch in the todo-* directory of every host of resource $res; switched
# off for "pause-replay" and on otherwise.
sub pause_replay_res {
  my ($cmd, $res) = @_;
  my $enable = ($cmd ne "pause-replay");
  foreach my $todo_dir (glob("$mars/resource-$res/todo-*/")) {
    _switch($cmd, $res, "$todo_dir/allow-replay", $enable);
  }
}
# Handler for "pause-replay-local" / "resume-replay-local": set only this
# host's allow-replay switch of resource $res; off for
# "pause-replay-local", on otherwise.
sub pause_replay_local_res {
  my ($cmd, $res) = @_;
  my $switch = "$mars/resource-$res/todo-$host/allow-replay";
  _switch($cmd, $res, $switch, $cmd ne "pause-replay-local");
}
# Handler for "up" / "down".
# "down" pauses replay and sync, disconnects and detaches, in that order;
# any other command performs the reverse steps in the reverse order.
sub up_res {
  my ($cmd, $res) = @_;
  if ($cmd eq "down") {
    pause_replay_res("pause-replay", $res);
    pause_sync_res("pause-sync", $res);
    connect_res("disconnect", $res);
    attach_res("detach", $res);
  }
  else {
    attach_res("attach", $res);
    connect_res("connect", $res);
    pause_sync_res("resume-sync", $res);
    pause_replay_res("resume-replay", $res);
  }
}
# Handler for "set-replay": move this host's replay link of resource $res
# to logfile number $new_nr.
# Requires a positive $new_nr and the allow-replay switch to be 0.
# Moving forward past the current number additionally requires $force.
# NOTE(review): closing braces are absent in this excerpt -- confirm the
# nesting against the full file.
sub set_replay_res {
my ($cmd, $res, $new_nr) = @_;
if (!$new_nr || $new_nr <= 0) {
ldie "you must supply a numeric logfile number as third argument.\n";
# replay must be switched off; no waiting ($wait = 0)
check_todo($cmd, $res, "allow-replay", 0, 0);
my $replaylink = "$mars/resource-$res/replay-$host";
my $old_val = get_link($replaylink);
my $old_nr = $old_val;
$old_nr =~ s/log-([0-9]+)-.*/$1/;
ldie "old log number '$old_nr' is wrong\n" unless $old_nr > 0;
if ($new_nr > $old_nr) {
lwarn "you try to skip logfile numbers from $old_nr to $new_nr, are you sure?\n";
ldie "you would need --force if you really know what you are doing.\n" unless $force;
_set_replaylink("$mars/resource-$res", $new_nr, "");
# result of symlink() is not checked
symlink("$new_nr", "$mars/resource-$res/skip-check-$host");
# Handler for "fake-sync": switch this host's sync off and set its
# syncstatus link of resource $res to the value of the size link.
# The new link is created under a .tmp name and renamed into place.
sub fake_local_res {
  my ($cmd, $res) = @_;
  my $resdir = "$mars/resource-$res";
  _switch($cmd, $res, "$resdir/todo-$host/sync", 0);
  #check_status($res, "copy-syncstatus-$host", 0);
  my $size = get_link("$resdir/size");
  my $target = "$resdir/syncstatus-$host";
  my $staging = "$target.tmp";
  symlink($size, $staging) or ldie("cannot create faked syncstatus\n");
  rename($staging, $target) or ldie("cannot reaname symlink\n");
}
# Set the designated primary of resource $res to $new.
#   $old - previous value, used in the message only
# The link is created under a temporary name and renamed over the
# "primary" link. Fatal when the link cannot be created or renamed.
sub _primary_res {
  my ($res, $new, $old) = @_;
  my $tmp = "$mars/resource-$res/.tmp.primary";
  my $pri = "$mars/resource-$res/primary";
  # Remove a stale temporary link, if any. unlink() replaces the former
  # shell call to rm -f; a missing file is not an error here.
  unlink($tmp);
  symlink($new, $tmp) or ldie("cannot create new primary symlink\n");
  rename($tmp, $pri) or ldie("cannot install new primary symlink\n");
  lprint("designated primary changed from '$old' to '$new'\n");
}
# Handler for "primary" / "secondary": change the designated primary of
# resource $res to this host ("primary") or to "(none)" ("secondary").
# NOTE(review): closing braces are absent, and nothing visible ends the sub
# after the "already ..." messages -- confirm against the full file.
sub primary_res {
my ($cmd, $res) = @_;
my $sec = ($cmd eq "secondary");
# we _must_ take the designated primary here, because the actual primary is a _runtime_ condition
my $old = _get_designated_primary($res);
my $new = $host;
if ($sec) {
if ($old eq '(none)') {
lprint "resource '$res' is already designated as secondary everywhere\n";
# NOTE(review): _get_actual_primary may return undef, which would warn
# in the "ne" comparison below -- confirm.
if (($old ne $host) && !$force && (_get_actual_primary($res) ne $host)) {
ldie "for safety reasons, switching to secondary is only allowed when I ($host) am designated primary or actually primary for resource '$res'\n";
$new = "(none)";
} elsif ($old eq $new) {
lprint "I am already designated primary on resource '$res'.\n";
} elsif ($force) {
lprint "FORCING myself ($host) to be the designated primary...\n";
} elsif (! -d "/proc/sys/mars") {
ldie "cannot switch to primary: mars kernel module is not loaded\n";
} else { # try to switch myself to primary
lprint "trying to switch $new to primary...\n";
# preconditions: sync finished, connect switch on, no split brain
check_sync_finished($res, $new);
check_todo($cmd, $res, "connect", 1, 0);
_primary_res($res, "(none)", $old) unless $old eq "(none)";
check_splitbrain($res, $new, -1);
_primary_res($res, $new, $old);
lprint "resource '$res': designated primary successfully changed from $old to $new\n";
# Handler for "invalidate": reset this host's syncstatus of resource $res
# to 0 and set its replay link to the current logfile of the designated
# primary. Replay is switched off beforehand and on again afterwards if it
# was on.
# NOTE(review): closing braces are absent, and no delay is visible after
# the "waiting..." messages -- confirm against the full file.
sub invalidate_res {
my ($cmd, $res) = @_;
# NOTE(review): $old_replay is computed but not used in the visible lines.
my $old_replay = get_link("$mars/resource-$res/replay-$host");
$old_replay =~ s/^([^,]+),.*/$1/;
my $repl = "$mars/resource-$res/todo-$host/allow-replay";
my $was_on = get_link($repl);
if ($was_on) {
_switch("pause-replay-local", $res, $repl, 0);
lprint "waiting...\n";
my $dst = "$mars/resource-$res/syncstatus-$host";
system("rm -f $dst");
symlink("0", $dst) or ldie "cannot create invalidation symlink '$dst'\n";
my $primary = _get_designated_primary($res);
my $replay = get_link("$mars/resource-$res/replay-$primary");
$replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n";
my $replay_nr = $1;
_set_replaylink("$mars/resource-$res", $replay_nr, $primary);
if ($was_on) {
lprint "waiting...\n";
_switch("resume-replay-local", $res, $repl, 1);
# Handler for "resize": set the logical size of resource $res.
#   $size_arg - optional new size (parsed by get_size); when omitted, the
#               smallest actsize-* value of the resource is used
# The new size must not exceed the smallest actsize-* value and must not
# be below the old size; an unchanged size additionally requires $force.
# Sync must be switched off on all hosts and this host's syncstatus must
# equal the old size.
# NOTE(review): closing braces are absent in this excerpt -- confirm the
# nesting against the full file.
sub resize_res {
my ($cmd, $res, $size_arg) = @_;
my $new_size = 0;
if ($size_arg) {
$new_size = get_size($size_arg);
ldie "optional size argument '$new_size' must be numeric and positive\n" unless scalar($new_size) > 0;
lprint "new size: $new_size bytes\n";
my @actsizes = glob("$mars/resource-$res/actsize-*");
ldie "resource $res has no actsize-* symlinks\n" unless @actsizes;
my $lnk = "$mars/resource-$res/size";
my $old_size = get_link($lnk);
# smallest value over all actsize-* links; 0 means "not yet set"
my $min_size = 0;
foreach my $actsize (@actsizes) {
my $this_size = get_link($actsize);
if (!$min_size || $this_size < $min_size) {
$min_size = $this_size;
lprint "old_size=$old_size\n";
lprint "min_size=$min_size\n";
$new_size = $min_size if !$new_size;
lprint "new_size=$new_size\n";
ldie "new size $new_size is higher than the minimum size of all volumes $min_size" if $new_size > $min_size; # no override with --force possible
# for now, disallow decreasing until some bugs are fixed
ldie "only increases of the size are possible!\n" if $new_size < $old_size;
ldie "only increases of the size are possible without --force\n" if $new_size <= $old_size && !$force;
foreach my $switch (glob("$mars/resource-$res/todo-*/sync")) {
my $this_switch = get_link($switch);
ldie "sync on '$switch' is switched on -- use marsadm pause-sync to stop\n" unless !$this_switch;
# the pattern contains no wildcard: only this host's syncstatus is handled
my @syncsizes = glob("$mars/resource-$res/syncstatus-$host");
foreach my $syncsize (@syncsizes) {
my $this_size = get_link($syncsize);
ldie "sync on $syncsize has not yet finished: $this_size != $old_size (DANGEROUS FIX: if you know what you are doing, marsadm fake-sync can 'fix' it -- but this may need a full-sync afterwards)\n" unless $this_size == $old_size;
# first the syncstatus, then the size link; each replaced via ".new" + rename
foreach my $syncsize (@syncsizes) {
my $this_size = get_link($syncsize);
symlink($new_size, "$syncsize.new") or ldie "cannot create size symlink '$syncsize.new'\n";
rename("$syncsize.new", $syncsize) or ldie "cannot create size symlink '$syncsize'\n";;
symlink($new_size, "$lnk.new") or ldie "cannot create size symlink '$lnk.new'\n";
rename("$lnk.new", $lnk) or ldie "cannot create size symlink '$lnk'\n";;
# Print one line describing the role of this host for resource $res:
# whether it is actually primary or secondary, plus which host is (or
# should be) primary when that differs from this host's own role.
sub role_cmd {
my ($cmd, $res) = @_;
my $primary = _get_actual_primary($res) || '(none)';
my $todo_primary = _get_designated_primary($res);
my $msg = "I am actually ";
$msg .= ($primary eq $host) ? "primary" : "secondary";
# actual and designated primary agree
if ($primary eq $todo_primary) {
$msg .= " and $primary is primary" if ($primary ne $host);
# they differ: report who should be primary
elsif ($primary ne $todo_primary) {
$todo_primary = "I" if ($todo_primary eq $host);
$msg .= " and $todo_primary should be primary";
lprint $msg . "\n";
# Print a one-line state of this host for resource $res: "is_primary",
# "becoming_primary", or one of the "secondary ..." states derived from
# the syncstatus and replay links.
# NOTE(review): closing braces are absent, and nothing visible ends the sub
# after a state has been printed -- confirm against the full file.
sub mars_state_cmd {
my ($cmd, $res) = @_;
my $primary = _get_actual_primary($res) || '(none)';
my $todo_primary = _get_designated_primary($res);
if ($primary eq $host) {
lprint "is_primary\n";
elsif ($todo_primary eq $host) {
lprint "becoming_primary\n";
# secondary without ambitions to become primary
my $size = get_link("$mars/resource-$res/size");
my $syncstatus = get_link("$mars/resource-$res/syncstatus-$host");
if ($syncstatus != $size) {
lprint "secondary inconsistent ($syncstatus bytes of $size)\n";
if ($primary eq "(none)") {
# no actual primary: use the host named in the logfile with the highest number
# ($min holds the highest number seen so far, despite its name)
my $min = 0;
foreach my $path (glob("$mars/resource-$res/log-*")) {
my $nr = $path;
$nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:;
if ($nr > $min) {
$primary = $path;
$primary =~ s:^.*/[a-z]+-[0-9]+-([^/]*)$:$1:;
$min = $nr;
# up to date means: same replay link value as the primary
my $primary_replay = get_link("$mars/resource-$res/replay-$primary");
my $host_replay = get_link("$mars/resource-$res/replay-$host");
if ($primary_replay eq $host_replay) {
lprint "secondary uptodate\n";
lprint "secondary outdated ($host_replay instead of $primary_replay)\n";
# Handler for "mars-info": print the contents of this host's
# logstatus-$host.status file of resource $res by running cat(1) on it.
sub mars_info_cmd {
  my ($cmd, $res) = @_;
  my $info = "$mars/resource-$res/logstatus-$host.status";
  # List form: the path is passed as a single argument without a shell, so
  # special characters in the resource name cannot be interpreted.
  system("cat", $info);
}
# Handler for "show": print "<path relative to $mars>=<link value>" for a
# fixed set of symlinks of this host, for resource $res or for all
# resources when $res is empty or "all".
sub show_cmd {
  my ($cmd, $res) = @_;
  if (!$res || $res eq "all") {
    $res = "*";
  }
  my $glob = "$mars/{ips/ip-$host,alive-$host,emergency-$host,rest-space-$host,resource-$res/{device,primary,size,actsize-$host,syncstatus-$host,replay-$host,actual-$host/*,todo-$host/*}}";
  for my $path (glob($glob)) {
    # only symlinks are shown
    next unless -l $path;
    my $value = get_link($path);
    (my $short = $path) =~ s:^$mars/::;
    lprint("$short=$value\n");
  }
}
# Print the usage text and exit with status 0.
# An optional first argument is printed beforehand as "ERROR: <text>".
# NOTE(review): the terminating quote of the multi-line usage string is not
# visible in this excerpt, and the URL near the end of the text reads
# "http://http://..." -- confirm both against the full file.
sub helplist {
my $temp;
$temp = shift;
lprint "ERROR: $temp" if ($temp);
lprint "
marsadm [<global_option>] <command> [<resource>] [<option>]
<global_option> =
--force : skip safety checks
use only when you know what you are doing!
--timeout=<seconds> : leave safety checks after timeout with an error
instead of waiting forever
<command> =
cluster : create-cluster join-cluster leave-cluster
resource : create-resource join-resource leave-resource
primary secondary invalidate
up down
replication: connect disconnect connect-local disconnect-local
replay : pause-replay resume-replay pause-replay-local resume-replay-local
sync : pause-sync resume-sync pause-sync-local resume-sync-local fake-sync
logfile : log-rotate log-delete log-delete-all
device : attach detach
internal : help version
<resource> = name of resource or \"all\" for all resources
<option> = special by command
Advanced information are also available here: http://http://wiki.intranet.1and1.com/bin/view/PO/ProjektTEC1603
exit 0;
# Print the program name followed by the revision id, then exit with status 0.
sub version {
    my $banner = join(" ", $0, $Id);
    lprint $banner . "\n";
    #lprint "my IP is $ip\n";
    exit 0;
}
# Dispatch table: maps each command keyword accepted on the command line to a
# reference to the subroutine implementing it. The main program rejects
# keywords that are not keys of this table, and do_res() looks up the handler
# here and calls it with ($cmd, $res, further arguments).
# Several keywords share one handler; the handler receives the keyword as its
# first argument (presumably to tell the variants apart -- verify).
# NOTE(review): the parentheses enclosing the list are not visible in this
# view -- TODO confirm against the full file.
my %cmd_table =
# new keywords
"create-cluster" => \&create_cluster,
"join-cluster" => \&join_cluster,
"leave-cluster" => \&leave_cluster,
"create-resource" => \&create_res,
"join-resource" => \&create_res,
"leave-resource" => \&leave_res,
"log-rotate" => \&logrotate_res,
"log-delete" => \&logdelete_res,
"log-delete-all" => \&logdelete_res,
"fake-sync" => \&fake_local_res,
"mars-state" => \&mars_state_cmd,
"mars-info" => \&mars_info_cmd,
"show" => \&show_cmd,
"pause-replay" => \&pause_replay_res,
"resume-replay" => \&pause_replay_res,
"pause-replay-local" => \&pause_replay_local_res,
"resume-replay-local" => \&pause_replay_local_res,
"set-replay" => \&set_replay_res,
# compatible keywords (or their derivatives)
"attach" => \&attach_res,
"detach" => \&attach_res,
"connect" => \&connect_res,
"disconnect" => \&connect_res,
"connect-local" => \&connect_local_res,
"disconnect-local" => \&connect_local_res,
"syncer" => \&ignore_cmd,
"up" => \&up_res,
"down" => \&up_res,
"primary" => \&primary_res,
"secondary" => \&primary_res,
"invalidate" => \&invalidate_res,
"invalidate-remote" => \&forbidden_cmd,
"resize" => \&resize_res,
"create-md" => \&senseless_cmd,
"get-gi" => \&ignore_cmd,
"show-gi" => \&ignore_cmd,
"dump-md" => \&senseless_cmd,
"outdate" => \&ignore_cmd,
"adjust" => \&ignore_cmd,
"wait-connect" => \&nyi_cmd,
"role" => \&role_cmd,
"state" => \&role_cmd,
"cstate" => \&nyi_cmd,
"dstate" => \&nyi_cmd,
"status" => \&nyi_cmd,
"dump" => \&senseless_cmd,
"verify" => \&nyi_cmd,
"pause-sync" => \&pause_sync_res,
"resume-sync" => \&pause_sync_res,
"pause-sync-local" => \&pause_sync_local_res,
"resume-sync-local" => \&pause_sync_local_res,
"new-current-uuid" => \&senseless_cmd,
"hidden-commands" => \&ignore_cmd,
# Command-line parsing: global options are recognised here, all other
# arguments are collected in @args (the command first, then the resource,
# then anything further, which is later passed on to the command handler).
# NOTE(review): some statement lines and the closing braces of the blocks
# below are not visible in this view, so several branch bodies look empty
# -- TODO confirm against the full file.
my @args;
foreach my $arg (@ARGV) {
if ($arg eq "--force") {
} elsif ($arg =~ s/--timeout\s*=\s*([0-9]+)/$1/) {
# The substitution replaced the matched option text by its digits, so $arg
# now carries the number of seconds.
$timeout = $arg;
# A leading "force-" is stripped from the argument before it is stored.
if ($arg =~ s/^force-//) {
push @args, $arg;
# The first collected argument is the command; helplist is called when it is absent.
my $cmd = shift @args || helplist "command argument is missing\n";
# A non-empty $notify makes lprint additionally send each message to logger.
$notify = "(cmd: $cmd)" unless $cmd eq "version";
if ($cmd =~ m/^help$/ || $cmd =~ m/^h$/) {
if ($cmd =~ m/^version$/ || $cmd =~ m/^v$/) {
# $< is the real user id of this process.
ldie "only root may use this tool\n" if $< != 0; # getpid() seems to be missing in perlfunc
helplist "unknown command $cmd\n" if !exists $cmd_table{$cmd};
# Resource argument: optional for "show", not taken for create-cluster and
# leave-cluster, mandatory for every other command.
my $res = "";
if ($cmd eq "show") {
$res = shift @args;
} elsif (!($cmd =~ m/^(create|leave)-cluster$/)) {
$res = shift @args || helplist "resource argument is missing\n";
lprint "using FORCE option -- hopefully you know what you do!\n" if $force;
# Run one command on one resource.
# Parameters: $cmd (command keyword, a key of %cmd_table), $res (resource
# name); any further arguments are handed to the command handler unchanged.
# Returns whatever the handler returns.
sub do_res {
    my $cmd = shift;
    my $res = shift;
    # The alternation is grouped so that ^ and $ anchor every alternative;
    # without the group they bind only to the first and the last one.
    $res = check_res($res)
        unless $cmd =~ m/^(?:(?:join|create|leave)-cluster|create-resource|show)$/;
    check_res_member($res)
        unless $cmd =~ m/^(?:(?:join|create)-(?:cluster|resource)|leave-cluster|show)$/;
    my $func = $cmd_table{$cmd};
    return $func->($cmd, $res, @_);
}
# Dispatch: for the pseudo resource "all" (any command except "show") the
# command is run once per matching resource directory below $mars; otherwise
# it is run once for the given resource.
# NOTE(review): the closing braces of these blocks are not visible in this
# view -- TODO confirm against the full file.
if ($res eq "all" && $cmd ne "show") {
# --force is refused in combination with "all".
ldie "For safty reasons, --force is only allowed on explicitly named resources. Combination of 'all' with --force is disallowed!\n" if $force;
foreach $res (glob("$mars/resource-*")) {
# Skip resource directories without a data-$host entry (presumably this
# host is not a member of that resource -- verify).
next unless -e "$res/data-$host";
# Reduce the directory path to the bare resource name.
$res =~ s/^.*\/resource-(.*)$/$1/;
lprint "--------- resource $res\n";
do_res($cmd, $res, @args);
} else {
do_res($cmd, $res, @args);