mars/userspace/marsadm

566 lines
18 KiB
Perl

#!/usr/bin/perl -w
# (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
use strict;
use English;
use warnings;
my $mars = "/mars";
my $host = `uname -n` or die "cannot determine my network node name\n";
chomp $host;
my $ip = `ip a` or die "cannot determine my IP address\n";
$ip =~ s/\A.*inet +(?!127\.0\.)([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*?\Z/$1/ms or die "cannot parse my IP address\n";
chomp $ip;
print "my IP is $ip\n";
umask 0077;
die "only root may use this tool\n" unless `whoami` eq "root\n"; # getpid() seems to be missing in perlfunc
##################################################################
# syntactic checks
sub check_id {
my $str = shift;
die "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z_][-A-Za-z0-9_]*$/;
die "identifier '$str' is too long (only 16 chars allowed)" if length($str) > 16;
}
##################################################################
# semantic checks
sub check_res {
my $res = shift;
if(not -d "$mars/resource-$res") {
# DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique
my $count = 0;
my $found;
my @tests = glob("$mars/resource-*/device-$host");
foreach my $test (@tests) {
my $target = readlink($test);
if($target eq $res) {
$found = $test;
$count++;
}
}
if(!$count) {
@tests = glob("$mars/resource-*/_direct-*-$host");
foreach my $test (@tests) {
my $target = readlink($test);
$target =~ s/^.*,//;
if($target eq $res) {
$found = $test;
$count++;
}
}
}
die "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found;
$found =~ s:^.*/resource-(.*)/.*$:$1:;
warn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n";
$res = $found;
}
return $res;
}
sub check_res_member {
my $res = shift;
die "sorry, I have not yet joined to resource '$res'\n" unless -e "$mars/resource-$res/data-$host";
}
sub check_sync_finished {
my ($res, $host) = @_;
my $lnk = "$mars/resource-$res/syncstatus-$host";
if(lstat($lnk)) {
my $syncstatus = readlink($lnk);
my $size = readlink("$mars/resource-$res/size") or die "cannot read size\n";
die "sync has not yet finished, only $syncstatus / $size bytes transferred\n" unless $syncstatus >= $size;
}
print "OK, it seems that sync has finished on $host.\n";
}
sub check_primary_gone {
my ($res) = @_;
for(;;) {
my @links = glob("$mars/resource-$res/actual-*/is-primary");
my $found = 0;
foreach my $link (@links) {
my $val = readlink($link);
$found++ if $val;
}
last if !$found;
print "waiting for $found other primary host(s) to disappear....\n";
sleep(5);
}
}
sub check_status {
my ($res, $key, $val) = @_;
for(;;) {
my $path = "$mars/resource-$res/actual-$host/$key";
my $link = readlink($path) or die "cannot read symlink '$path'\n";
last if $link == $val;
print "waiting until $key reaches the value $val....\n";
sleep(5);
}
}
sub check_splitbrain {
my ($res, $host) = @_;
my @links = glob("$mars/resource-$res/version-[0-9]*-$host");
die "no version information available for host $host\n" unless @links;
my $link = pop @links;
die "cannot pop last link element\n" unless $link;
my $version = readlink($link);
die "cannot get version information for $link\n" unless $version;
my $serial = $link;
$serial =~ s:$mars/resource-$res/version-([0-9]+).*$:$1:;
print "my version: $link -> $version\n";
foreach my $other (glob("$mars/resource-$res/version-[0-9]*-*")) {
$other =~ m:$mars/resource-$res/version-([0-9]+)-(.*)$:;
my $other_serial = $1;
my $other_host = $2;
next if $serial ne $other_serial;
next if $host eq $other_host;
my $other_version = readlink($other);
print "other version: $other -> $other_version\n";
die "split brain detected: $other_version != $version\n" unless $version eq $other_version;
}
}
sub get_size {
my $arg = shift;
return -1 unless $arg =~ m/^[0-9.]+[kmgtp]?$/i;
my $mod = $arg;
$arg =~ s/[^0-9]+$//;
$mod =~ s/^[0-9]+//;
$_ = $mod;
SWITCH: {
/k/i and $arg *= 1024, last SWITCH;
/m/i and $arg *= 1024 * 1024, last SWITCH;
/g/i and $arg *= 1024 * 1024 * 1024, last SWITCH;
/t/i and $arg *= 1024 * 1024 * 1024 * 1024, last SWITCH;
/p/i and $arg *= 1024 * 1024 * 1024 * 1024 * 1024, last SWITCH;
}
return $arg;
}
##################################################################
# helpers
sub _trigger {
system("(echo 1 > /proc/sys/mars) >/dev/null 2>&1");
}
sub _switch {
my ($cmd, $res, $path, $on) = @_;
my $src = $on ? "1" : "0";
my $old = readlink($path);
if($old && $old eq $src) {
print "${cmd} on resource $res is already activated\n" if $cmd;
return;
}
my $tmp = $path;
$tmp =~ s/\/([^\/]+)$/.tmp.$1/;
symlink($src, $tmp) or die "cannot create switch symlink\n";
rename($tmp, $path) or die "cannot rename switch symlink\n";
print "successfully started ${cmd} on resource $res\n" if $cmd;
}
sub _writable {
my ($path, $on) = @_;
my $oldmode = (lstat $path)[2] & 0700;
my $newmode = $on ? $oldmode | 0200 : $oldmode & ~0200;
print "chmod '$path' $oldmode $newmode";
chmod($newmode, $path) == 1 or die "cannot chmod '$path'\n";
}
##################################################################
# commands
sub ignore_cmd {
my ($cmd, $res) = @_;
print "ignoring command '$cmd' on resource '$res'\n";
exit(0);
}
sub senseless_cmd {
my ($cmd, $res) = @_;
print "command '$cmd' makes no sense with MARS (ignoring)\n";
exit(0);
}
sub forbidden_cmd {
my ($cmd, $res) = @_;
die "command '$cmd' cannot be used with MARS (it is impossible to carry out uniquely and could therefore lead to a disaster)\n";
}
sub nyi_cmd {
my ($cmd, $res) = @_;
die "command '$cmd' is not yet implemented\n";
}
sub _create_cluster {
my ($cmd) = @_;
system("mkdir $mars") unless -d $mars;
system("mkdir $mars/ips") unless -d "$mars/ips";
system("mkdir $mars/defaults") unless -d "$mars/defaults";
system("mkdir $mars/defaults-$host") unless -d "$mars/defaults-$host";
symlink($ip, "$mars/ips/ip-$host");
}
sub create_cluster {
my ($cmd, $peer, $force) = @_;
die "cluster is already created\n" if !$force && -d "$mars/ips";
_create_cluster(@_);
}
sub join_cluster {
my ($cmd, $peer, $force) = @_;
if(glob("$mars/resource-*")) {
die "Sorry, some resources already exist!\nThis is dangerous!\nIf you are sure that no resource clash is possible, re-invoke this command with '--force' option\n" unless ($force and $force =~ m/--force/);
}
print "joining cluster via rsync (peer='$peer')\n";
_create_cluster(@_);
system("rsync --recursive --links -v $peer:$mars/ips/ $mars/ips/") == 0 or die "oops\n";
symlink($ip, "$mars/ips/ip-$host");
system("rsync --recursive --links -v $mars/ips/ $peer:$mars/ips/") == 0 or die "oops\n";
}
sub create_res {
my ($cmd, $res, $dev, $appear) = @_;
my $create = ($cmd eq "create-resource");
die "undefined device or size argument\n" unless $dev;
$appear = $res if !$appear;
check_id($appear) if $appear;
if($create) {
die "resource '$res' already exists\n" if -d "$mars/resource-$res";
print "creating new resource '$res'\n";
} else {
print "joining to existing resource '$res'\n";
}
my $size = get_size($dev);
if($size > 0) {
$dev = "";
} else {
die "block device '$dev' does not exist\n" unless -b $dev;
die "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//;
use Fcntl 'SEEK_END';
open(TEST, "<$dev") or die "cannot open device for reading\n";
$size = sysseek(TEST, 0, SEEK_END);
close(TEST);
print "device size = $size bytes\n";
die "implausible size" unless $size > 0;
}
my $tmp = "$mars/.tmp.$res";
my $primary;
my $replay;
if($create) {
_create_cluster(@_);
system("rm -rf $tmp");
system("mkdir $tmp") == 0 or die "could not create resource '$res'\n";
symlink($size, "$tmp/size") or die "cannot create size indicator symlink\n";
} else {
$tmp = "$mars/resource-$res";
die "resource '$res' does not exist\n" unless -d $tmp;
$primary = readlink("$tmp/primary") or die "cannot determine primary\n";
if($primary eq "(none)") {
my @list = glob("$tmp/replay-*") or die "cannot find any candidate for primary\n";
my $first = pop @list or die "bad glob list\n";
$primary = readlink($first) or die "cannot determine peer\n";
$primary =~ s/^log-[0-9]+-(.*),.*,.*$/$1/;
print "using '$primary' as primary\n";
}
die "resource '$res' is already joined\n" if -e "$tmp/data-$host";
die "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless -l "$mars/ips/ip-$host";
my $oldsize = readlink("$tmp/size") or die "cannot determine old size\n";
if($size < $oldsize) {
print "adjusting size to $oldsize\n";
$size = $oldsize;
}
die "sizes differ: real size = $oldsize, but requested size = $size\n" unless $oldsize == $size;
$replay = readlink("$tmp/replay-$primary") or die "cannot read replay status of primary '$primary'\n";
$replay =~ s/-$primary,[0-9]+,[0-9]+/-$host,0,0/ or die "something is wrong here\n";
my $nr = $replay;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
if($nr > 1) { # fake version symlink for predecessor logfile
my $prev = sprintf("$tmp/version-%09d-$primary", $nr - 1);
my $version = readlink($prev) or die "cannot read symlink '$prev'\n";
my $old = sprintf("$tmp/version-%09d-$host", $nr - 1);
symlink($version, $old) or die "cann create symlink '$old'\n";
}
}
my $file = "$tmp/data-$host";
if(!$dev) {
print "creating sparse file '$file' with size $size\n";
open(OUT, ">$file") or die "could not open '$file'\n";
use Fcntl 'SEEK_SET';
sysseek(OUT, $size-1, SEEK_SET) == $size-1 or die "could not seek\n";
syswrite(OUT, '\0', 1) == 1 or die "cannot init sparse file\n";
close OUT;
} else {
print "using existing device '$dev'\n";
symlink($dev, $file) or die "cannot create device symlink\n";
}
if($appear) {
# TODO: check for uniqeness of $appear
print "resource '$res' will appear as local device '/dev/mars/$appear'\n";
system("rm -f $tmp/device-$host");
symlink($appear, "$tmp/device-$host") or die "cannot create symlink for local device appearance\n";
}
mkdir("$tmp/defaults") unless -d "$tmp/defaults";
mkdir("$tmp/defaults-$host");
mkdir("$tmp/actual-$host");
my $todo = "$tmp/todo-$host";
mkdir($todo);
symlink("1", "$todo/attach");
symlink("0", "$todo/connect");
symlink("1", "$todo/sync");
system("rm -f $tmp/syncstatus-$host");
if($create) {
symlink($host, "$tmp/primary") or die "cannot create primary symlink\n";
symlink($size, "$tmp/syncstatus-$host") or die "cannot create primary syncstatus\n";
symlink("log-000000001-$host,0,0", "$tmp/replay-$host") or die "cannot create replay status\n";
system("touch $tmp/log-000000001-$host");
rename($tmp, "$mars/resource-$res") or die "cannot finalize resource '$res'\n";
print "successfully created resource '$res'\n";
} else {
system("rm -f $tmp/replay-$host");
symlink($replay, "$tmp/replay-$host") or die "cannot create replay status\n";
symlink("0", "$tmp/syncstatus-$host") or die "cannot start initial sync\n";
system("rm -f $tmp/connect-$host");
symlink($primary, "$tmp/connect-$host") or die "cannot create peer connect symlink\n";
symlink($host, "$tmp/connect-$primary") unless -l "$tmp/connect-$primary";
print "successfully joined resource '$res'\n";
}
}
sub logrotate_res {
my ($cmd, $res) = @_;
my @paths = glob("$mars/resource-$res/log-*-$host") or die "cannot find any logfiles\n";
@paths = sort(@paths);
my $last = pop(@paths);
if(-z $last) {
print "an empty logfile '$last' already exists, nothing to do.\n";
return;
}
my $nr = $last;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1);
die "logfile '$next' already exists\n" if -e $next;
system("touch $next");
}
sub attach_res {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
my $path = "$mars/resource-$res/todo-$host/attach";
_switch($cmd, $res, $path, !$detach);
}
sub connect_res {
my ($cmd, $res) = @_;
my $disconnect = ($cmd eq "disconnect");
my @paths = glob("$mars/resource-$res/todo-*/connect");
for my $path (@paths) {
_switch($cmd, $res, $path, !$disconnect);
}
}
sub connect_local_res {
my ($cmd, $res) = @_;
my $disconnect = ($cmd eq "disconnect-local");
my $path = "$mars/resource-$res/todo-$host/connect";
_switch($cmd, $res, $path, !$disconnect);
}
sub pause_res {
my ($cmd, $res) = @_;
my $pause = ($cmd eq "pause-sync");
my @paths = glob("$mars/resource-$res/todo-*/sync");
for my $path (@paths) {
_switch($cmd, $res, $path, !$pause);
}
}
sub pause_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd eq "pause-sync-local");
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, !$pause);
}
sub up_res {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if($down) {
pause_res("pause-sync", $res);
connect_res("disconnect", $res);
attach_res("detach", $res);
} else {
attach_res("attach", $res);
connect_res("connect", $res);
pause_res("resume-sync", $res);
}
}
sub fake_local_res {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, 0);
#check_status($res, "copy-syncstatus-$host", 0);
my $size = readlink("$mars/resource-$res/size") or die "cannot read size\n";
my $target = "$mars/resource-$res/syncstatus-$host";
symlink($size, "$target.tmp") or die "cannot create faked syncstatus\n";
rename("$target.tmp", $target) or die "cannot reaname symlink\n";
}
sub _primary_res {
my ($res, $host, $pri, $old) = @_;
my $tmp = "$mars/resource-$res/.tmp.primary";
system("rm -f $tmp");
symlink($host, $tmp) or die "cannot create new primary symlink\n";
rename($tmp, $pri) or die "cannot install new primary symlink\n";
print "primary changed from '$old' to '$host'\n";
}
sub primary_res {
my ($cmd, $res) = @_;
my $sec = ($cmd eq "secondary");
my $pri = "$mars/resource-$res/primary";
my $old = readlink($pri) or die "cannot determine current primary\n";
if($sec) {
die "for safety reasons, switching to secondary is only allowed when I ($host) am primary\n" if($old ne $host);
$host = "(none)";
} elsif($old eq $host) {
print "I am already primary.\n";
exit(0);
} elsif($cmd eq "force-primary") {
print "forcing myself ($host) to primary...\n";
} else { # try to switch myself to primary
print "trying to switch $host to primary...\n";
check_sync_finished($res, $host);
_primary_res($res, "(none)", $pri, $old) unless $old eq "(none)";
check_primary_gone($res);
check_splitbrain($res, $host);
}
_primary_res($res, $host, $pri, $old);
}
sub invalidate_res {
my ($cmd, $res) = @_;
my $pri = "$mars/resource-$res/primary";
my $old = readlink($pri) or die "cannot determine current primary\n";
my $dst = "$mars/resource-$res/syncstatus-$host";
die "primary side cannot be invalidated\n" if $old eq $host;
system("rm -f $dst");
symlink("0", $dst) or die "cannot create invalidation symlink '$dst'\n";
}
sub role_cmd {
my ($cmd, $res) = @_;
my $pri = "$mars/resource-$res/primary";
my $old = readlink($pri) or die "cannot determine current primary\n";
if($old eq $host) {
print "primary\n";
} else {
print "secondary\n";
}
}
##################################################################
my %cmd_table =
(
# new keywords
"create-cluster" => \&create_cluster,
"join-cluster" => \&join_cluster,
"create-resource" => \&create_res,
"join-resource" => \&create_res,
"logrotate" => \&logrotate_res,
"fake-sync" => \&fake_local_res,
# compatible keywords
"attach" => \&attach_res,
"detach" => \&attach_res,
"connect" => \&connect_res,
"disconnect" => \&connect_res,
"connect-local" => \&connect_local_res,
"disconnect-local" => \&connect_local_res,
"syncer" => \&ignore_cmd,
"up" => \&up_res,
"down" => \&up_res,
"primary" => \&primary_res,
"secondary" => \&primary_res,
"force-primary" => \&primary_res,
"invalidate" => \&invalidate_res,
"invalidate-remote" => \&forbidden_cmd,
"resize" => \&nyi_cmd,
"create-md" => \&senseless_cmd,
"get-gi" => \&ignore_cmd,
"show-gi" => \&ignore_cmd,
"dump-md" => \&senseless_cmd,
"outdate" => \&ignore_cmd,
"adjust" => \&ignore_cmd,
"wait-connect" => \&nyi_cmd,
"role" => \&role_cmd,
"state" => \&role_cmd,
"cstate" => \&nyi_cmd,
"status" => \&nyi_cmd,
"dump" => \&senseless_cmd,
"verify" => \&nyi_cmd,
"pause-sync" => \&pause_res,
"resume-sync" => \&pause_res,
"pause-sync-local" => \&pause_local_res,
"resume-sync-local" => \&pause_local_res,
"new-current-uuid" => \&senseless_cmd,
"dstate" => \&nyi_cmd,
"hidden-commands" => \&ignore_cmd,
);
my $cmd = shift || die "command argument is missing\n";
die "unknown command '$cmd'\n" if !exists $cmd_table{$cmd};
my $res = "";
unless($cmd =~ m/^create-cluster$/) {
$res = shift || die "resource argument is missing\n";
check_id($res);
}
sub do_res {
my $cmd = shift;
my $res = shift;
$res = check_res($res) unless $cmd =~ m/^(join|create)-cluster|create-resource$/;
check_res_member($res) unless $cmd =~ m/^(join|create)-(cluster|resource)$/;
my $func = $cmd_table{$cmd};
&{$func}($cmd, $res, @_);
}
if($res eq "all") {
foreach $res (glob("$mars/resource-*")) {
next unless -e "$res/data-$host";
$res =~ s/^.*\/resource-(.*)$/$1/;
print "--------- resource $res\n";
do_res($cmd, $res, @ARGV);
}
} else {
do_res($cmd, $res, @ARGV);
}
_trigger();