#!/usr/bin/perl -w # (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG use strict; use English; use warnings; umask 0077; ################################################################## # messaging my $error_count = 0; my $notify = ""; sub lprint { my ($text) = @_; print $text; if ($notify) { system("/usr/bin/logger -t marsadm \"$notify $text\""); } } sub ldie { my ($text) = @_; $error_count++; lprint "DYING: $text"; die "\n"; } sub lwarn { my ($text) = @_; lprint "WARNING: $text"; } ################################################################## # low-level infrastructure my @link_list = (); my %link_hash; my $verbose = 0; sub get_link { my ($path, $unchecked) = @_; my $result = readlink($path); if (!defined($result)) { ldie "cannot read symlink '$path'\n" unless $unchecked; lwarn "cannot read symlink '$path'\n" if $unchecked == 1; } return $result; } sub to_tmp { my $path = shift; $path =~ s:^(.*)/:$1/.tmp.:; return $path; } sub from_tmp { my $path = shift; $path =~ s:^(.*)/\.tmp\.:$1/:; return $path; } sub set_link { my ($src, $dst) = @_; my $dst_tmp = to_tmp($dst); unlink($dst_tmp); symlink($src, $dst_tmp) or ldie "cannot create symlink '$dst' -> '$src'\n"; # the _order_ is important! remove existing intermediate element before re-appanding if (exists($link_hash{$dst})) { my @copy = @link_list; @link_list = (); foreach my $elem (@copy) { next if $elem eq $dst; push @link_list, $elem; } } $link_hash{$dst} = $src; push @link_list, $dst; } sub finish_links { return unless @link_list; my $timestamp = time(); undef $/; my $lamport = ""; if (open(my $fh, "<", "/proc/sys/mars/lamport_clock")) { $lamport = <$fh>; } if ($lamport =~ m/lamport_now=([0-9.]+)/) { $timestamp = $1; lprint "using lamport timestamp $timestamp\n" if $verbose; } while (my $link = shift @link_list) { my $link_tmp = to_tmp($link); system("touch -h -d \"\@$timestamp\" $link_tmp") == 0 or ldie "cannot set mtime on symlink '$link_tmp'\n"; rename($link_tmp, $link) or ldie "cannot finalize symlink '$link'\n"; if ($verbose) { my $target = readlink($link); lprint "created symlink '$link' -> '$target'\n"; } } _trigger(); } ################################################################## # global variables and checks my $Id = '$Id$ '; my $user_version = 0.1; my $mars = "/mars"; my $host = `uname -n` or ldie "cannot determine my network node name\n"; chomp $host; my $force = 0; my $timeout = -1; my $ip = _get_ip() or ldie "cannot determine my IP address\n"; my $kernel_version = 0; unless ($ARGV[0] =~ m/cluster|cat/) { $kernel_version = get_link("$mars/tree-$host", 1); if ($kernel_version && $user_version != $kernel_version) { lwarn "kernel_version=$kernel_version user_version=$user_version\n"; if ($user_version < $kernel_version) { ldie "Sorry, your MARS kernel module uses version $kernel_version, but my $0 userspace version is only $user_version. That cannot work. Please upgrade your userspace scripts!\n"; } elsif (int($user_version) > int($kernel_version) + 1) { ldie "MAJOR VERSION mismatch : only one major upgrade step is supported.\n"; } lwarn "using different minor versions is possible, but you should upgrade your kernel module ASAP\n"; } } sub get_alive_links { my $res = shift || "all"; my $alive = shift || "alive"; my $glob = "$mars/$alive-*"; if ($res ne "all") { $glob = "$mars/$alive-{"; my $count = 0; foreach my $peer (glob("$mars/resource-$res/connect-*")) { $peer =~ m:/connect-(.+):; $glob .= "," if $count++; $glob .= $1; } $glob .= "}"; } my %links; foreach my $path (glob($glob)) { $path =~ m:/$alive-(.+):; my $peer = $1; $links{$peer} = get_link($path); } return %links; } ################################################################## # timeout handling # # return the lamport clock time in nanosecond resolution # fallback to system time() # sub mars_time { open(my $lamport_clock, "<", "/proc/sys/mars/lamport_clock"); my $lamport_time; while (<$lamport_clock>) { $lamport_time = $1 if /^lamport_now=(.*)/; } close($lamport_clock); return $lamport_time || time() . '0' x 9; } sub sleep_timeout { my $sleeptime = shift || 5; if ($timeout < 0) { sleep($sleeptime); return; } ldie "Timeout reached. You may retry with --timeout=-1 to ensure waiting until progress is possible.\n" if !$timeout; my $rest = $timeout; $rest = $sleeptime if $rest > $sleeptime; sleep($rest); $timeout -= $rest; } # wait for some condition sub wait_cond { my ($cmd, $res, $specific) = @_; my $is_actual = ($specific =~ s/^(is|has)-//); my $is_on = !($specific =~ s/-off$//); $specific =~ s/-on$//; if ($is_actual) { if ($specific eq "device") { check_mars_device($cmd, $res, 1, !$is_on); return; } my %table = ( "attach" => "is-attached", "attached" => "is-attached", "replay" => "is-replaying", "replaying"=> "is-replaying", "copy" => "is-copying", "copying" => "is-copying", "sync" => "is-syncing", "syncing" => "is-syncing", "primary" => "is-primary", ); my $name = $table{$specific}; ldie "actual indicator '$specific' does not exist\n" unless exists($table{$specific}); check_status($cmd, $res, $name, $is_on ? 1 : 0, 1); } else { my %table = ( "attach" => "attach", "attached" => "attach", "connect" => "connect", "replay" => "allow-replay", "sync" => "sync", ); my $name = $table{$specific}; ldie "button '$specific' does not exist\n" unless exists($table{$specific}); check_todo($cmd, $res, $name, $is_on ? 1 : 0, 1); } } # wait until some communication has occurred sub wait_cluster { return wait_cond(@_) if int(@_) >= 3; my $cmd = shift; my $res = shift || "all"; my $start_time = mars_time(); _trigger(); my $delta = $timeout > 0 ? $timeout : 30; while (1) { my $dead_count = 0; my $alive_count = 0; my $unknown_count = 0; my %status = get_alive_links($res, "time"); my $now = mars_time(); foreach my $peer (keys(%status)) { next if $peer eq $host; if ($status{$peer} >= $start_time) { $alive_count++; } elsif ($status{$peer} + $delta < $now) { $dead_count++; } else { $unknown_count++; } } if (!$dead_count && !$unknown_count) { lprint "all $alive_count peer(s) seem to be alive\n"; last; } if (!$unknown_count) { lwarn "$alive_count peer(s) seem to be alive, and $dead_count peer(s) seem to be dead / not reachable\n"; ldie "--force not given\n" unless $force; last; } sleep(1); } } ################################################################## # syntactic checks sub check_id { my $str = shift; ldie "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z_][-A-Za-z0-9_]*$/; ldie "identifier '$str' is too long (only 16 chars allowed)" if length($str) > 16; } ################################################################## # semantic checks sub check_res { my $res = shift; if (not -d "$mars/resource-$res") { # DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique my $count = 0; my $found; my @tests = glob("$mars/resource-*/device-$host"); foreach my $test (@tests) { my $target = get_link($test, 2); if ($target eq $res) { $found = $test; $count++; } } if (!$count) { @tests = glob("$mars/resource-*/_direct-*-$host"); foreach my $test (@tests) { my $target = get_link($test, 2); $target =~ s/^.*,//; if ($target eq $res) { $found = $test; $count++; } } } ldie "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found; $found =~ s:^.*/resource-(.*)/.*$:$1:; lwarn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n"; $res = $found; } return $res; } sub check_sizes { my ($res, $host) = @_; my $logical_size = get_link("$mars/resource-$res/size"); my $physical_size = get_link("$mars/resource-$res/actsize-$host", 1); if (defined($physical_size) && $physical_size < $logical_size) { ldie "physical device on host '$host' has size $physical_size, which is smaller than the logical resource size $logical_size\n"; } } sub check_res_member { my $res = shift; ldie "sorry, I have not yet joined to resource '$res'\n" unless -e "$mars/resource-$res/data-$host"; check_sizes($res, $host); } sub check_sync_finished { my ($res, $host) = @_; check_sizes(@_); my $lnk = "$mars/resource-$res/syncstatus-$host"; if (lstat($lnk)) { my $syncstatus = get_link($lnk, 1); my $size = get_link("$mars/resource-$res/size"); ldie "sync has not yet finished, only $syncstatus / $size bytes transferred\n" unless $syncstatus >= $size; } lprint "OK, it seems that sync has finished on $host.\n"; } sub check_primary { my ($cmd, $res) = @_; my $lnk = "$mars/resource-$res/actual-$host/is-primary"; my $is_primary = get_link($lnk); if (!$is_primary) { # give it a second chance my $name = get_link("$mars/resource-$res/device-$host"); my $dev = "/dev/mars/$name"; $is_primary = 1 if -b $dev; } ldie "for operation '$cmd' I need to be primary\n" unless $is_primary; my $primary = _get_designated_primary($res); ldie "for operation '$cmd', I also must be the designated primary\n" unless $primary eq $host; } sub check_not_primary { my ($cmd, $res) = @_; my $lnk = "$mars/resource-$res/actual-$host/is-primary"; my $is_primary = get_link($lnk); if ($is_primary) { ldie "operation '$cmd' cannot be executed on primary\n"; } # also check whether we intend to become primary my $primary = _get_designated_primary($res); ldie "operation '$cmd' cannot be executed on designated primary\n" if $primary eq $host; } sub check_primary_gone { my ($res) = @_; for (;;) { my $pri = _get_actual_primary($res); last if !$pri; last if $pri eq $host; lprint "waiting for other primary host ($pri) to disappear....\n"; sleep_timeout(); } } sub check_todo { my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_; my $path = "$mars/resource-$res/todo-$host/$key"; my $link; for (;;) { $link = get_link($path, $unchecked); return unless defined($link); if (defined($inv) && $inv) { last if $link != $val; ldie "cannot execute $cmd: switch '$key' must not have value '$val'\n" if !$wait; lprint "waiting until switch '$key' leaves the value '$val'....\n"; } else { last if $link == $val; ldie "cannot execute $cmd: switch '$key' must have value '$val', but actually has value '$link'\n" if !$wait; lprint "waiting until switch '$key' reaches the value '$val'....\n"; } sleep_timeout(); } lprint "OK, '$path' has acceptable value '$link'\n"; } sub check_status { my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_; my $path = "$mars/resource-$res/actual-$host/$key"; my $link; for (;;) { $link = get_link($path, $unchecked); return unless defined($link); if (defined($inv) && $inv) { last if $link != $val; ldie "cannot execute $cmd: '$path' must not have value '$val'\n" if !$wait; lprint "waiting until '$key' leaves the value '$val'...\n"; } else { last if $link == $val; ldie "cannot execute $cmd: '$path' must have value '$val'\n" if !$wait; lprint "waiting until '$key' reaches the value '$val'...\n"; } sleep_timeout(); } lprint "OK, '$path' has acceptable value '$link'\n"; } sub check_mars_device { my ($cmd, $res, $wait, $inv) = @_; my $name = get_link("$mars/resource-$res/device-$host"); my $dev = "/dev/mars/$name"; my $backoff = 1; my $round = 0; if ($inv) { while (-b $dev) { ldie "cannot execute $cmd: device '$dev' has not yet disappeared\n" if !$wait; lwarn "device '$dev' has not yet disappeared\n"; sleep_timeout($backoff); # very slowly increasing backoff if ($backoff < 10 && $round++ > 5) { $round = 0; $backoff++; } } lprint "device '$dev' is no longer present\n" unless -b $dev; return; } # !$inv while (! -e $dev) { my $text = get_error_text($cmd, $res); lprint $text if $text; ldie "aborting due to errors\n" if $text =~ m/error/mi; ldie "cannot execute $cmd: device '$dev' not yet present\n" if !$wait; lprint "device '$dev' not yet present\n"; sleep_timeout($backoff); # very slowly increasing backoff if ($backoff < 10 && $round++ > 5) { $round = 0; $backoff++; } } lprint "device '$dev' is present\n" if -b $dev; } sub check_userspace { my ($dst) = @_; if ($dst !~ m:/userspace/:) { ldie "your path '$dst' must be inside $mars/userspace/ or $mars/resource-*/userspace/\n" unless $force; lwarn "your path '$dst' is outside $mars/userspace/ or $mars/resource-*/userspace/ and you gave --force, hopefully YOU KNOW WHAT YOU ARE DOING\n"; } } ################################################################## # state inspection routines sub _get_minmax { my ($res, $glob, $take_symlink) = @_; my $min = -1; my $max = -1; my @paths = glob($glob) or ldie "cannot find '$glob'\n"; foreach my $path (@paths) { my $nr = $path; if ($take_symlink) { $nr = get_link($path); } $nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:; $min = $nr if ($nr < $min || $min < 0); $max = $nr if ($nr > $max || $max < 0); } return ($min, $max); } sub get_minmax_logfiles { my ($res) = @_; return _get_minmax($res, "$mars/resource-$res/log-*", 0); } sub get_minmax_versions { my ($res) = @_; return _get_minmax($res, "$mars/resource-$res/version-*", 0); } sub get_minmax_any { my ($res) = @_; return _get_minmax($res, "$mars/resource-$res/{log,version}-*", 0); } sub get_minmax_replays { my ($res) = @_; return _get_minmax($res, "$mars/resource-$res/replay-*", 1); } sub try_to_avoid_splitbrain { # NYI } sub get_size { my $arg = shift || ""; if (!($arg =~ m/^([0-9]+(?:\.[0-9]*)?)([kmgtp]?)$/i)) { ldie "size argument '$arg' must be a number, optionally followed by suffix k or m or g or t or p\n"; } my $mod = $2 || ""; $arg = $1; $_ = $mod; SWITCH: { /k/i and $arg *= 1024, last SWITCH; /m/i and $arg *= 1024 * 1024, last SWITCH; /g/i and $arg *= 1024 * 1024 * 1024, last SWITCH; /t/i and $arg *= 1024 * 1024 * 1024 * 1024, last SWITCH; /p/i and $arg *= 1024 * 1024 * 1024 * 1024 * 1024, last SWITCH; } ldie "size '$arg' is not a multiple of 4k\n" if ($arg % 4096) != 0; return $arg; } # # Get actual primary node from links below actual-*/ subdirs # sub _get_actual_primary { my ($res) = @_; my @primary_links = glob("$mars/resource-$res/actual-*/is-primary"); my $primary; foreach my $link (@primary_links) { if (my $val = get_link($link)) { $primary = ($link =~ qr%.*actual-([^/]+)/is-primary%)[0]; last; # Note: if there are more than one 'is-primary' links (an insane state anyway), # the first 'is-primary' link is selected. Other links are ignored. } } return $primary; } sub _get_designated_primary { my ($res) = @_; return get_link("$mars/resource-$res/primary"); } sub get_peers { my ($res) = @_; my @list = glob("$mars/resource-$res/connect-*"); return map { $_ =~ s:$mars/resource-$res/connect-::; $_ } @list; } sub __conv_tv { my ($tv_sec, $tv_nsec) = @_; my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($tv_sec); return sprintf("%04d-%02d-%02d %02d:%02d:%02d.%s", $year+1900, $mon + 1, $mday, $hour, $min, $sec, $tv_nsec); } sub _replace_timestamps { my ($txt) = @_; $txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1,$2):ge; return $txt; } sub _get_text { my ($path, $regex, $do_print) = @_; open(IN, "<", $path) or return ""; my $text = ""; my $count = 0; while (my $line = ) { # use regex e.g. for fetching only errors and warnings if (!$regex || $line =~ $regex) { $line = _replace_timestamps($line); $count++; if ($do_print) { print $line; } else { $text .= $line; } } } close(IN); return $count if $do_print; return $text; } sub get_error_text { my ($cmd, $res) = @_; my $text = _get_text("$mars/resource-$res/logstatus-$host.status", "m/^(err|warn)/i", 0); return $text; } ################################################################## # helpers sub _trigger { system("(echo 2 > /proc/sys/mars/trigger) >/dev/null 2>&1"); } sub _switch { my ($cmd, $res, $path, $on) = @_; my $src = $on ? "1" : "0"; my $old = get_link($path); if ($old && $old eq $src) { lprint "${cmd} on resource $res is already activated\n" if $cmd; return; } set_link($src, $path); lprint "successfully started ${cmd} on resource $res\n" if $cmd; } sub _writable { my ($path, $on) = @_; my $oldmode = (lstat $path)[2] & 0700; my $newmode = $on ? $oldmode | 0200 : $oldmode & ~0200; lprint "chmod '$path' $oldmode $newmode"; chmod($newmode, $path) == 1 or ldie "cannot chmod '$path'\n"; } sub _get_ip { chomp (my @info = `/sbin/ip addr show dev eth0`); foreach (@info) { m#\sinet\s(\d+\.\d+\.\d+\.\d+)# && return $1; } return undef; } # Which primary was active in the past when logfile number $log_nr was created? # In general, this may be even a node which doesn't exist anymore. # $supposed_primary should be normally empty, but may be used # to give a hint and check for consistency of ancient knowledge. sub _get_former_primary { my ($basedir, $log_nr, $supposed_primary) = @_; my $primary = $supposed_primary; foreach my $type ("log", "version") { my $base_path = sprintf("$basedir/$type-%09d-", $log_nr); my $pri_path = "$base_path$supposed_primary"; my $log_path = "$base_path*"; my @names = glob($log_path); if (!@names) { if ($type eq "log" and !$supposed_primary) { next; } ldie "Sorry, there exist no names '$log_path'\n" unless $supposed_primary; } elsif (scalar(@names) == 1) { my $found = $names[0]; if ($supposed_primary) { ldie "Sorry, '$pri_path' does not exist, although '$found' would exist.\n" unless $pri_path eq $found; } else { # assume that the found name is the right one. $pri_path = $found; lprint "found '$pri_path'\n"; ldie "found name is malformed\n" unless $pri_path =~ m:^$base_path(.*):; $primary = $1; } } else { # multiple exist... lprint "There are multiple names with number $log_nr.\n"; my $present = 0; foreach my $file (@names) { lprint " $file\n"; $present++ if $file eq $pri_path; } if ($type eq "log") { lwarn "Usually, this is an indication for split-brain.\n"; lwarn "Be careful!\n"; } ldie "Cannot select between them -- no primary preference given.\n" unless $supposed_primary; ldie "Sorry, '$pri_path' is not among them.\n" unless $present; lprint "=> using '$pri_path' out of them\n"; } return $primary; } ldie "could not determine old primary site for logfile version number $log_nr\n"; } sub _fake_versionlink { my ($basedir, $log_nr, $primary) = @_; $primary = _get_former_primary(@_) unless $primary; my $new_version = sprintf("$basedir/version-%09d-$host", $log_nr); my $pri_version = sprintf("$basedir/version-%09d-$primary", $log_nr); if ($primary eq $host) { lwarn "it makes no sense to fake my own version link '$new_version'\n"; return; } my $pri_link = get_link($pri_version); if (!$pri_link) { # try any one else lwarn "cannot read symlink '$pri_version' -- trying a neighbor link instead\n"; my $try_version = sprintf("$basedir/version-%09d-*", $log_nr); my @test = glob($try_version); my $test_version = shift @test; if ($test_version) { lwarn "trying substitute symlink '$test_version'\n"; my $test_link = get_link($test_version); if ($test_link) { $pri_link = $test_link; lwarn "got value '$pri_link', hopefully this is right\n" if $pri_link; } } } if ($pri_link) { lprint "creating new version symlink '$new_version' -> '$pri_link'\n"; set_link($pri_link, $new_version); } else { lwarn "cannot read symlink '$pri_version' -- cannot create faked versionlink '$pri_version'\n"; } } sub _set_replaylink { my ($basedir, $log_nr) = @_; my $primary = _get_former_primary(@_); my $rep_path = "$basedir/replay-$host"; my $rep_val = sprintf("log-%09d-$primary,0,0", $log_nr); lprint "creating new replaylink '$rep_path' -> '$rep_val'\n"; set_link($rep_val, $rep_path); if ($log_nr > 1) { my $old_primary = ""; my $vers_link = sprintf("$basedir/version-%09d-$primary", $log_nr); my $vers_val = get_link($vers_link); $old_primary = $1 if $vers_val =~ m/:.*,log-[0-9]+-([^,]+),/; _fake_versionlink($basedir, $log_nr - 1, $old_primary); } else { my $initial; for (;;) { $initial = get_link("$basedir/version-000000001-$primary", 1); last if $initial; sleep_timeout(); } set_link($initial, "$basedir/version-000000001-$host"); } } ################################################################## # commands sub ignore_cmd { my ($cmd, $res) = @_; lprint "ignoring command '$cmd' on resource '$res'\n"; } sub senseless_cmd { my ($cmd, $res) = @_; lprint "command '$cmd' makes no sense with MARS (ignoring on resource '$res')\n"; } sub forbidden_cmd { my ($cmd, $res) = @_; ldie "command '$cmd' on resource '$res' cannot be used with MARS (migth affect too many hosts, lead to undesired consequences)\n"; } sub nyi_cmd { my ($cmd, $res) = @_; ldie "command '$cmd' on resource '$res' is not yet implemented\n"; } sub is_module_loaded { open TEST, "lsmod | grep mars |"; my $res = ; close TEST; return $res; } sub set_link_cmd { my $cmd = shift; for (;;) { my $src = shift || last; my $dst = shift || ldie "you did not supply a symlink destination for source '$src'\n"; ldie "symlink target '$dst' is not an absolute path\n" unless $dst =~ m:^/:; check_userspace($dst); my $dir = `dirname "$dst"` or ldie "path '$dst' has no dirname\n"; chomp $dir; ldie "directory '$dir' does not exist\n" unless -d $dir; set_link($src, $dst); } } sub _create_cluster { my ($cmd) = @_; system("mkdir $mars") unless -d $mars; my $old_uuid = get_link("$mars/uuid", 2); if ($cmd eq "create-cluster") { ldie "cluster was already created with uuid='$old_uuid'\n" if $old_uuid && !$force; my $uuid = `echo -n \$(hostname) \$(date)`; set_link($uuid, "$mars/uuid"); finish_links(); # opportunity for errors => don't continue } elsif (!$old_uuid && !$force) { if ($user_version == 0.1) { my $uuid = `echo -n \$(hostname) \$(date)`; set_link($uuid, "$mars/uuid"); } ldie "cluster has no uuid\n" if $user_version > 0.1; } system("mkdir $mars/ips") unless -d "$mars/ips"; system("mkdir $mars/userspace") unless -d "$mars/userspace"; system("mkdir $mars/defaults") unless -d "$mars/defaults"; system("mkdir $mars/defaults-$host") unless -d "$mars/defaults-$host"; system("mkdir $mars/todo-global") unless -d "$mars/todo-global"; set_link($ip, "$mars/ips/ip-$host"); set_link("1", "$mars/todo-global/deleted-$host"); } sub create_cluster { my ($cmd, $peer) = @_; ldie "cluster is already created\n" if !$force && -d "$mars/ips"; ldie "mars module is loaded, please unload first\n" if is_module_loaded(); _create_cluster(@_); } sub join_cluster { my ($cmd, $peer) = @_; if (glob("$mars/resource-*") or glob("$mars/ips/*")) { ldie "Sorry, some resources already exist!\nThis is dangerous!\nIf you are sure that no resource clash is possible, re-invoke this command with '--force' option\n" unless $force; } ldie "mars module is loaded, please unload first\n" if is_module_loaded(); lprint "joining cluster via rsync (peer='$peer')\n"; # check connection system("ssh $peer uname -a") == 0 or ldie "oops, no connection to $peer ...\n"; mkdir($mars) unless -d $mars; system("rsync --recursive --links --max-size=1 -v $peer:$mars/ $mars/") == 0 or ldie "cannot get remote symlink tree via rsync\n"; _create_cluster(@_); finish_links(); system("rsync --recursive --links -v $mars/ips/ $peer:$mars/ips/") == 0 or ldie "oops\n"; } sub leave_cluster { my ($cmd) = @_; my $check = "/mars/resource-*/*-$host"; ldie "I am member of some resources\n" if glob($check) && !$force; _create_delete("$mars/ips/$host"); } sub create_res { my ($cmd, $res, $dev, $appear, $size_arg) = @_; my $create = ($cmd eq "create-resource"); ldie "undefined device or size argument\n" unless $dev; $appear = $res if !$appear; check_id($appear) if $appear; my $resdir = "$mars/resource-$res"; if ($create) { ldie "resource directory '$res' already exists; you may override this via --force\n" if (!$force && -d $resdir); ldie "resource directory '$res' has some contents -- remove by hand if you are sure that you really know what you are doing, or give --force.\n" if (!$force && glob("$resdir/*.status")); lprint "creating new resource '$res'\n"; } else { if ( -e "$resdir/connect-$host" || -e "$resdir/data-$host") { lwarn "resource '$res' has been already joined -- this is dangerous!\n"; ldie "refusing dangerous operation\n" unless $force; } else { lprint "joining to existing resource '$res'\n"; } } my $size = 0; if (-b $dev) { ldie "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//; use Fcntl 'SEEK_END', 'O_RDONLY', 'O_RDWR', 'O_EXCL'; my $flags = O_RDWR | O_EXCL; if ($force) { $flags = O_RDONLY; } sysopen(TEST, $dev, $flags) or ldie "cannot open device '$dev' for exclusive rw access\n"; $size = sysseek(TEST, 0, SEEK_END); close(TEST); lprint "block device '$dev': determined size = $size bytes\n"; if ($size_arg) { my $new_size = get_size($size_arg); ldie "size argument '$size_arg' is smaller than device size '$size'\n" unless $new_size <= $size; lprint "reducing size from $size to $new_size\n"; $size = $new_size; } } else { $size = get_size($dev); if ($size > 0) { $dev = ""; } else { ldie "bad parameter '$dev': must be either a block device name, or size followed by k or m or g or t\n"; } } ldie "implausible size $size" unless $size > 4096 * 16; # smaller floppies should not exist ;) my $primary; my $replay_nr = -1; if ($create) { mkdir($resdir); ldie "could not create resource '$res'\n" unless -d $resdir; set_link($size, "$resdir/size"); } else { # join ldie "resource '$res' does not exist\n" unless -d $resdir; my $res_size = get_link("$mars/resource-$res/size"); if ($size < $res_size) { lwarn "size of new device is only $size, but should be $res_size\n"; ldie "refusing to join due to bad size\n" unless $force; } elsif ($size > $res_size) { lprint "Your physical device has size $size, which is larger than the logical resource size $res_size.\n"; lprint "This does no harm, but you are wasting some space.\n"; } $primary = _get_designated_primary($res); ldie "implausible state: I ($host) am already designated primary of resource '$res' which I just wanted to join\n" if $primary eq $host; if ($primary eq "(none)") { my @list = glob("$resdir/replay-*") or ldie "cannot find any candidate for primary\n"; my $first = pop @list or ldie "bad glob list\n"; $primary = get_link($first); $primary =~ s/^log-[0-9]+-(.*),.*,.*$/$1/; lprint "using '$primary' as primary\n"; } ldie "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless -l "$mars/ips/ip-$host"; my $replay = get_link("$resdir/replay-$primary"); if ($replay =~ m/^log-([0-9]+)-/) { $replay_nr = $1; } else { ldie "cannot determine current logfile number.\n"; } } # check for uniqeness of $appear if ($appear) { foreach my $old_dev (glob("$mars/resource-*/device-$host")) { $old_dev =~ m:/resource-([^/]+)/:; next unless defined($1); my $old_res = $1; next if $old_res eq $res; my $old_name = get_link($old_dev); if ($old_name eq $appear) { if ( -e "$mars/resource-$old_res/data-$host") { ldie "device '/dev/mars/$old_name' is already present in joined resource '$old_res'\n"; } else { lwarn "device '/dev/mars/$old_name' is already present in another unjoined resource '$old_res' -- this does no harm, but may be confusing.\n"; } } } # warn if devices are named differently throughout the cluster foreach my $old_dev (glob("$resdir/device-*")) { my $old_name = get_link($old_dev); if ($old_name ne $appear) { $old_dev =~ m:/device-(.+)$:; my $old_host = $1; lwarn "your name '/dev/mars/$appear' differs from '/dev/mars/$old_name' on host '$old_host'."; lwarn "this does no harm, but may be confusing."; } } } my $file = "$resdir/data-$host"; if (!$dev) { lwarn "file '$file' already exists - reusing\n" if -e $file; lprint "setup sparse file '$file' with size $size\n"; open(OUT, ">>", $file) or ldie "could not open '$file'\n"; truncate(OUT, $size) or ldie "truncate to size $size failed\n"; close OUT; } else { lprint "using existing device '$dev'\n"; set_link($dev, $file); } if ($appear) { lprint "resource '$res' will appear as local device '/dev/mars/$appear'\n"; set_link($appear, "$resdir/device-$host"); } mkdir("$resdir/userspace") unless -d "$resdir/userspace"; mkdir("$resdir/defaults") unless -d "$resdir/defaults"; mkdir("$resdir/defaults-$host"); mkdir("$resdir/local-$host"); mkdir("$resdir/actual-$host"); my $todo = "$resdir/todo-$host"; mkdir($todo); set_link("1", "$todo/attach"); set_link("1", "$todo/connect"); set_link("1", "$todo/sync"); set_link("1", "$todo/allow-replay"); unlink("$resdir/syncstatus-$host"); if ($create) { set_link($host, "$resdir/primary"); set_link($size, "$resdir/syncstatus-$host"); set_link("log-000000001-$host,0,0", "$resdir/replay-$host"); system("touch $resdir/log-000000001-$host"); set_link("00000000000000000000000000000000,log-000000001-$host,0:", "$resdir/version-000000001-$host"); finish_links(); lprint "successfully created resource '$res'\n"; } else { _set_replaylink($resdir, $replay_nr, $primary); set_link("0", "$resdir/syncstatus-$host"); set_link($primary, "$resdir/connect-$host"); set_link($host, "$resdir/connect-$primary") unless -l "$resdir/connect-$primary"; finish_links(); lprint "successfully joined resource '$res'\n"; } } sub leave_res_phase0 { my ($cmd, $res) = @_; check_not_primary(@_); foreach my $tmp (glob("$mars/resource-$res/todo-$host/*")) { my $status = get_link($tmp, 2); ldie "switch '$tmp' is not off\n" if $status; } check_status($cmd, $res, "is-attached", 0, 0, 1); if (!$force) { foreach my $tmp (glob("$mars/resource-$res/actual-$host/{is-,logfile-}*")) { my $status = get_link($tmp); ldie "running status '$tmp' is not off\n" if $status; } } } sub leave_res_phase1 { my ($cmd, $res) = @_; my $peerlink = "$mars/resource-$res/connect-$host"; my $peer = get_link($peerlink, 1); foreach my $tmp (glob("$mars/resource-$res/connect-*")) { next if $tmp eq $peerlink; my $target = get_link($tmp); next unless $target eq $host; lprint "changing '$tmp' from '$host' to '$peer'\n"; set_link($peer, $tmp); } _create_delete($peerlink); finish_links(); # opportunity for errors => don't continue _create_delete("$mars/resource-$res/replay-$host"); _create_delete("$mars/resource-$res/data-$host"); _create_delete("$mars/resource-$res/syncstatus-$host"); _create_delete("$mars/resource-$res/device-$host"); _create_delete("$mars/resource-$res/actsize-$host"); foreach my $dir (glob("$mars/resource-$res/*-$host/")) { foreach my $tmp (glob("${dir}*")) { _create_delete($tmp); } _create_delete($dir); } cleanup_res($cmd, $res, 0); } sub logrotate_res { my ($cmd, $res) = @_; check_primary(@_); my @paths = glob("$mars/resource-$res/log-*-$host") or ldie "cannot find any logfiles\n"; @paths = sort(@paths); my $last = pop(@paths); if (-z $last) { lprint "an empty logfile '$last' already exists, nothing to do.\n"; return; } my $nr = $last; $nr =~ s/^.*log-([0-9]+)-.+$/$1/; my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1); ldie "logfile '$next' already exists\n" if -e $next; system("touch $next"); } sub _get_deletable_logfiles { my ($cmd, $res) = @_; my $min = -1; my $max = -1; my @log_paths = glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n"; foreach my $path (@log_paths) { $path =~ m/\/log-([0-9]+)-/; my $nr = $1; $min = $nr if ($nr < $min || $min < 0); $max = $nr if ($nr > $max || $max < 0); } my @paths = glob("$mars/resource-$res/replay-*") or ldie "cannot find any replay symlinks\n"; foreach my $path (@paths) { my $target = get_link($path); $target =~ m/^log-([0-9]+)/; my $nr = $1; $max = $nr if ($nr < $max || $max < 0); } lprint "min deletable logfile number: $min\n"; lprint "min non-deletable logfile number: $max\n"; return ($min, $max); } my $delete_nr = -1; sub _create_delete { my ($target) = @_; ldie "cannot delete: '$target' is no absolute path\n" unless $target =~ m:^/:; if ($delete_nr < 0) { # compute only upon first call my @paths = glob("$mars/todo-global/delete-*"); foreach my $path (@paths) { $path =~ m/-([0-9]+)/; if (defined($1) && $1 > $delete_nr) { $delete_nr = $1; } } my @paths2 = glob("$mars/todo-global/deleted-*"); foreach my $path (@paths2) { my $link = get_link($path, 1); $link =~ m/([0-9]+)/; if (defined($1) && $1 > $delete_nr) { $delete_nr = $1; } } } my $new = sprintf("$mars/todo-global/delete-%09d", ++$delete_nr); lprint "create symlink $new -> $target\n"; set_link($target, $new); } sub delete_file_cmd { my $cmd = shift; foreach my $path (@_) { check_userspace($path); _create_delete($path); } } sub logdelete_res { my ($cmd, $res) = @_; my @paths = glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n"; @paths = sort(@paths); my ($min_deletable, $max_deletable) = _get_deletable_logfiles(@_); if ($min_deletable >= $max_deletable) { lprint "no logfiles are deletable.\n"; return; } if ($cmd ne "log-delete-all") { $max_deletable = $min_deletable + 1; # delete only the first one } my $nr = 0; for (;;) { my $first = shift(@paths); last unless $first; $nr = $first; $nr =~ s/^.*log-([0-9]+)-.+$/$1/; next unless $nr < $max_deletable; lprint "chosen '$first' for deletion\n"; _create_delete($first); } lprint "removing left-over version symlinks...\n"; foreach my $versionlink (glob("$mars/resource-$res/version-*")) { my $nrv = $versionlink; $nrv =~ s/^.*\/version-([0-9]+)-.+$/$1/; # we need at least one more version link than logfiles for consistency checks next unless $nrv < $max_deletable - 1; _create_delete($versionlink); } } sub attach_res_phase0 { my ($cmd, $res) = @_; return if $force; my $detach = ($cmd eq "detach"); if ($detach) { my $device_in_use = get_link("$mars/resource-$res/actual-$host/open-count", 1); if ($device_in_use) { my $name = get_link("$mars/resource-$res/device-$host"); my $dev = "/dev/mars/$name"; ldie "device '$dev' is in use\n"; } } } sub attach_res_phase1 { my ($cmd, $res) = @_; my $detach = ($cmd eq "detach"); my $path = "$mars/resource-$res/todo-$host/attach"; _switch($cmd, $res, $path, !$detach); } sub attach_res_phase2 { my ($cmd, $res) = @_; my $detach = ($cmd eq "detach"); check_status($cmd, $res, "is-attached", $detach ? 0 : 1, 1); if ($detach) { check_mars_device($cmd, $res, 1, 1); } } sub connect_global_res { my ($cmd, $res) = @_; my $disconnect = ($cmd =~ m/disconnect/); my @paths = glob("$mars/resource-$res/todo-*/"); for my $path (@paths) { _switch($cmd, $res, "$path/connect", !$disconnect); } } sub connect_local_res { my ($cmd, $res) = @_; my $disconnect = ($cmd =~ m/disconnect/); my $path = "$mars/resource-$res/todo-$host/connect"; _switch($cmd, $res, $path, !$disconnect); } sub pause_sync_global_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); my @paths = glob("$mars/resource-$res/todo-*/"); for my $path (@paths) { _switch($cmd, $res, "$path/sync", !$pause); } } sub pause_sync_local_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); my $path = "$mars/resource-$res/todo-$host/sync"; _switch($cmd, $res, $path, !$pause); } sub pause_replay_global_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); my @paths = glob("$mars/resource-$res/todo-*/"); for my $path (@paths) { _switch($cmd, $res, "$path/allow-replay", !$pause); } } sub pause_replay_local_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); my $path = "$mars/resource-$res/todo-$host/allow-replay"; _switch($cmd, $res, $path, !$pause); } sub up_res_phase0 { my ($cmd, $res) = @_; my $down = ($cmd eq "down"); if ($down) { attach_res_phase0("detach", $res); } else { attach_res_phase0("attach", $res); } } sub up_res_phase1 { my ($cmd, $res) = @_; my $down = ($cmd eq "down"); if ($down) { pause_replay_local_res("pause-replay-local", $res); pause_sync_local_res("pause-sync-local", $res); connect_local_res("disconnect", $res); attach_res_phase1("detach", $res); } else { attach_res_phase1("attach", $res); connect_local_res("connect", $res); pause_sync_local_res("resume-sync-local", $res); pause_replay_local_res("resume-replay-local", $res); } } sub up_res_phase2 { my ($cmd, $res) = @_; my $down = ($cmd eq "down"); if ($down) { attach_res_phase2("detach", $res); } else { attach_res_phase2("attach", $res); } } sub set_replay_res { my ($cmd, $res, $new_nr) = @_; if (!$new_nr || $new_nr <= 0) { ldie "you must supply a numeric logfile number as third argument.\n"; } check_not_primary(@_); check_todo($cmd, $res, "allow-replay", 0, 0); my $replaylink = "$mars/resource-$res/replay-$host"; my $old_val = get_link($replaylink); my $old_nr = $old_val; $old_nr =~ s/log-([0-9]+)-.*/$1/; ldie "old log number '$old_nr' is wrong\n" unless $old_nr > 0; if ($new_nr > $old_nr) { lwarn "you try to skip logfile numbers from $old_nr to $new_nr, are you sure?\n"; ldie "you would need --force if you really know what you are doing.\n" unless $force; } _set_replaylink("$mars/resource-$res", $new_nr, ""); set_link("$new_nr", "$mars/resource-$res/skip-check-$host"); } sub fake_local_res { my ($cmd, $res) = @_; my $path = "$mars/resource-$res/todo-$host/sync"; _switch($cmd, $res, $path, 0); #check_status($res, "copy-syncstatus-$host", 0); my $size = get_link("$mars/resource-$res/size"); my $target = "$mars/resource-$res/syncstatus-$host"; set_link($size, $target); } sub _primary_res { my ($res, $new, $old) = @_; my $pri = "$mars/resource-$res/primary"; set_link($new, $pri); lprint "designated primary changed from '$old' to '$new'\n"; } # check whether primary/secondary switching is possible at all sub primary_phase0 { my ($cmd, $res) = @_; ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars"); if ($cmd eq "primary" and !$force) { check_sync_finished($res, $host); check_todo($cmd, $res, "attach", 1, 0); check_todo($cmd, $res, "connect", 1, 0); check_todo($cmd, $res, "allow-replay", 1, 0); #check_status($cmd, $res, "replay_rate", 0, 0, 1); } my $old = _get_designated_primary($res); return if ($old eq $host and $cmd eq "primary"); return if $old eq "(none)"; my $device_in_use = get_link("$mars/resource-$res/actual-$old/open-count", 1); if ($device_in_use) { my $name = get_link("$mars/resource-$res/device-$old", 1) || "unknown"; lwarn "device '/dev/mars/$name' for resource '$res' is $device_in_use times in use on primary host '$old'\n"; ldie "first you must umount/close the device (on host '$old')\n" unless $force; } lprint "all preconditions OK for resource '$res'\n"; } # when necessary, switch to secondary (intermediately) sub primary_phase1 { my ($cmd, $res) = @_; return if ($force and $cmd eq "primary"); my $old = _get_designated_primary($res); return if ($old eq $host and $cmd eq "primary"); return if ($old ne $host and $cmd eq "secondary"); my $new = "(none)"; return if $old eq $new; _primary_res($res, $new, $old); } # when necessary, wait sub primary_phase2 { my ($cmd, $res) = @_; return if $force; return unless $cmd eq "primary"; check_primary_gone($res); try_to_avoid_splitbrain(@_); } # when necessary, switch to primary sub primary_phase3 { my ($cmd, $res) = @_; return unless $cmd eq "primary"; my $old = _get_designated_primary($res); my $new = $host; return if $old eq $new; _primary_res($res, $new, $old); } # wait for device to appear / disappear sub primary_phase4 { my ($cmd, $res) = @_; return if $force; if($cmd eq "secondary") { check_mars_device($cmd, $res, 1, 1); return; } check_mars_device($cmd, $res, 1, 0); } sub wait_umount_res { my ($cmd, $res) = @_; while (1) { my $sum = 0; foreach my $path (glob("$mars/resource-$res/actual-*/open-count")) { $sum += get_link($path); } last if !$sum; lprint "device for resource '$res' is $sum times in use somewhere\n"; sleep_timeout(3); } lprint "OK, device for resource '$res' is not in use.\n"; } sub cleanup_res { my ($cmd, $res, $replay_nr) = @_; my @logfiles = glob("$mars/resource-$res/log-*-$host"); my @versions = glob("$mars/resource-$res/version-*-$host"); foreach my $path (@logfiles) { $path =~ m:/log-([0-9]+)-: or next; my $nr = $1; next if $nr >= $replay_nr; _create_delete($path); } foreach my $path (@logfiles, @versions) { $path =~ m:/version-([0-9]+)-: or next; my $nr = $1; next if $nr >= $replay_nr - 1; _create_delete($path); } } sub invalidate_res_phase0 { my ($cmd, $res) = @_; check_not_primary(@_); } sub invalidate_res_phase1 { my ($cmd, $res) = @_; _switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 0); _switch($cmd, $res, "$mars/resource-$res/todo-$host/allow-replay", 0); } sub invalidate_res_phase2 { my ($cmd, $res) = @_; if (!$force) { check_status($cmd, $res, "is-syncing", 0, 1); check_status($cmd, $res, "is-replaying", 0, 1); } } sub invalidate_res_phase3 { my ($cmd, $res) = @_; my $dst = "$mars/resource-$res/syncstatus-$host"; my $primary = _get_designated_primary($res); my $replay = get_link("$mars/resource-$res/replay-$primary"); $replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n"; my $replay_nr = $1; set_link("0", $dst); finish_links(); # opportunity for errors => don't continue set_link($replay, "$mars/resource-$res/replay-$host"); _switch($cmd, $res, "$mars/resource-$res/todo-$host/allow-replay", 1); _switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 1); } sub resize_res { my ($cmd, $res, $size_arg) = @_; my $new_size = 0; if ($size_arg) { $new_size = get_size($size_arg); ldie "optional size argument '$new_size' must be numeric and positive\n" unless scalar($new_size) > 0; lprint "new size: $new_size bytes\n"; } check_primary(@_); my @actsizes = glob("$mars/resource-$res/actsize-*"); ldie "resource $res has no actsize-* symlinks\n" unless @actsizes; my $lnk = "$mars/resource-$res/size"; my $old_size = get_link($lnk); my $min_size = 0; foreach my $actsize (@actsizes) { my $this_size = get_link($actsize); if (!$min_size || $this_size < $min_size) { $min_size = $this_size; } } lprint "old_size=$old_size\n"; lprint "min_size=$min_size\n"; $new_size = $min_size if !$new_size; lprint "new_size=$new_size\n"; ldie "new size $new_size is higher than the minimum size of all volumes $min_size" if $new_size > $min_size; # no override with --force possible # for now, disallow decreasing until some bugs are fixed ldie "only increases of the size are possible!\n" if $new_size < $old_size; ldie "only increases of the size are possible without --force\n" if $new_size <= $old_size && !$force; foreach my $switch (glob("$mars/resource-$res/todo-*/sync")) { my $this_switch = get_link($switch); ldie "sync on '$switch' is switched on -- use marsadm pause-sync to stop\n" unless !$this_switch; } my @syncsizes = glob("$mars/resource-$res/syncstatus-$host"); foreach my $syncsize (@syncsizes) { my $this_size = get_link($syncsize); ldie "sync on $syncsize has not yet finished: $this_size != $old_size (DANGEROUS FIX: if you know what you are doing, marsadm fake-sync can 'fix' it -- but this may need a full-sync afterwards)\n" unless $this_size == $old_size; } foreach my $syncsize (@syncsizes) { my $this_size = get_link($syncsize); set_link($new_size, $syncsize); } set_link($new_size, $lnk); } sub role_cmd { my ($cmd, $res) = @_; my $primary = _get_actual_primary($res) || '(none)'; my $todo_primary = _get_designated_primary($res); my $msg = "I am actually "; $msg .= ($primary eq $host) ? "primary" : "secondary"; if ($primary eq $todo_primary) { $msg .= " and $primary is primary" if ($primary ne $host); } elsif ($primary ne $todo_primary) { $todo_primary = "I" if ($todo_primary eq $host); $msg .= " and $todo_primary should be primary"; } lprint $msg . "\n"; } sub mars_state_cmd { my ($cmd, $res) = @_; my $primary = _get_actual_primary($res) || '(none)'; my $todo_primary = _get_designated_primary($res); if ($primary eq $host) { lprint "is_primary\n"; return; } elsif ($todo_primary eq $host) { lprint "becoming_primary\n"; return; } # secondary without ambitions to become primary my $size = get_link("$mars/resource-$res/size"); my $syncstatus = get_link("$mars/resource-$res/syncstatus-$host"); if ($syncstatus != $size) { lprint "secondary inconsistent ($syncstatus bytes of $size)\n"; return; } if ($primary eq "(none)") { my $min = 0; foreach my $path (glob("$mars/resource-$res/log-*")) { my $nr = $path; $nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:; if ($nr > $min) { $primary = $path; $primary =~ s:^.*/[a-z]+-[0-9]+-([^/]*)$:$1:; $min = $nr; } } } my $primary_replay = get_link("$mars/resource-$res/replay-$primary"); my $host_replay = get_link("$mars/resource-$res/replay-$host"); if ($primary_replay eq $host_replay) { lprint "secondary uptodate\n"; return; } lprint "secondary outdated ($host_replay instead of $primary_replay)\n"; } sub cat_cmd { my $cmd = shift; foreach my $path (@_) { _get_text($path, undef, 1); } } sub mars_info_cmd { my ($cmd, $res) = @_; my $info = "$mars/resource-$res/logstatus-$host.status"; cat_cmd($cmd, $info); } sub show_cmd { my ($cmd, $res) = @_; $res = "*" if !$res || $res eq "all"; my $glob = "$mars/{ips/ip-$host,alive-$host,emergency-$host,rest-space-$host,resource-$res/{device,primary,size,actsize-$host,syncstatus-$host,replay-$host,actual-$host/*,todo-$host/*}}"; foreach my $link (glob($glob)) { next unless -l $link; my $res = get_link($link); my $short = $link; $short =~ s:^$mars/::; lprint "$short=$res\n"; } } sub show_errors_cmd { my ($cmd, $res) = @_; my $text = get_error_text(@_); if ($text) { lprint $text; ldie "resource $res has some (old) problems.\n"; } else { lprint "no errors/warnings are reported.\n"; } } sub helplist { my $temp; $temp = shift; lprint "ERROR: $temp" if ($temp); lprint " marsadm [] [] [