#!/usr/bin/perl -w # # MARS Long Distance Replication Software # # This file is part of MARS project: http://schoebel.github.io/mars/ # # Copyright (C) 2010-2014 Thomas Schoebel-Theuer # Copyright (C) 2011-2014 1&1 Internet AG # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. use strict; use English; use warnings; umask 0077; # regexes for scanning of reserved terminal symbol classes use v5.10; my $reserved_names = "none|all|any|local|localhost"; my $match_reserved_substr = qr"\(?($reserved_names)\)?"; my $match_reserved_id = qr{^$match_reserved_substr$}; ################################################################## # global defaults my $parallel = -999; my $single_step = 0; my $inject_phase = 0; my $compat_deletions = 1; my $compat_alivelinks = 1; my $threshold = 10 * 1024 * 1024; my $window = 60; my $keep_backup_hours = 24 * 7; my $verbose = 0; my $max_deletions = 512; my $thresh_logfiles = 10; my $thresh_logsize = 5; # GB my $dry_run = 0; # All paths should be overridable from outside my $etc_marsadm = $ENV{ETC_MARSADM} ? $ENV{ETC_MARSADM} : "/etc/marsadm"; my @MARS_PATH = $ENV{MARS_PATH} ? split(/:/, $ENV{MARS_PATH}) : ( ".", defined($ENV{HOME}) ? "$ENV{HOME}/.marsadm" : "", $etc_marsadm, "/usr/lib/marsadm", "/usr/local/lib/marsadm", ); ################################################################## # messaging my %skip_res; my $error_count = 0; my $notify = ""; my $child_prefix = ""; my $logger = "/usr/bin/logger"; sub llog { my ($text) = @_; if ($notify) { $text =~ s/"/\\"/g; system("$logger -t marsadm \"$notify $text\""); } } sub lprint { my ($text) = @_; $OUTPUT_AUTOFLUSH = 1; print $child_prefix . $text; llog($text); } sub lprint_stderr { my ($text) = @_; $OUTPUT_AUTOFLUSH = 1; print STDERR $child_prefix . $text; llog($text); } sub lskip { my ($res, $text) = @_; # Already set %skip_res here, so it doesn't count as error $skip_res{$res} = 1; lprint_stderr "SKIPPING: $text"; # trigger an exception die "\n"; } sub ldie { my ($text) = @_; if ($verbose > 2) { my $i = 1; for (;;) { my ($package, $filename, $line, $subroutine, $hasargs, $wantarray, $evaltext, $is_require, $hints, $bitmask, $hinthash) = caller($i++); last unless defined($subroutine); lprint_stderr "$line:$subroutine\n"; } } $error_count++; lprint_stderr "DYING: $text"; llog("DYING: $text"); die "\n"; } sub lwarn { my ($text) = @_; lprint_stderr "WARNING: $text"; llog("WARNING: $text"); } sub lhint { my ($text) = @_; lprint_stderr " HINT: $text"; llog("HINT: $text"); } ################################################################## # basic helpers sub raw_glob { my $expr = shift; my @raw_glob = glob($expr); return @raw_glob; } sub lamport_glob { my $expr = shift; my @result =(); my @raw_glob = glob($expr); foreach my $path (@raw_glob) { my $val = readlink($path); next if (defined($val) && $val eq ".deleted"); push @result, $path; } return @result; } sub safe_creat { my $path = shift; if (-l $path) { my $val = readlink($path); unlink($path) if $val eq ".deleted"; } my $fh = undef; use IO::Handle; use Fcntl; my $status = sysopen($fh, $path, O_CREAT | O_EXCL | O_NOFOLLOW | O_RDONLY); close($fh); return (defined($status) && $status); } sub safe_touch { my ($path, $stamp) = @_; # Workaround for non-implemented undef parameter at utime() # [should be implemented beyond perl 5.8.0 according to "man perlfunc", # but seems to not work as documented.] $stamp = time() unless defined($stamp); use IO::Handle; if (-l $path) { my $val = readlink($path); return 0 if $val eq ".deleted"; # Perl doesn't seem to support AT_SYMLINK_NOFOLLOW my $opt = $stamp ? "-d \"\@$stamp\"" : ""; my $status = system("touch -h $opt \"$path\""); # additional systemd-trigger when relevant if ($path =~ m:/(primary|systemd):p) { my $trig_path = "$PREMATCH/systemd-trigger"; unless ($trig_path =~ m:/userspace/:) { utime($stamp, $stamp, $trig_path) || safe_creat($trig_path); } } return $status; } my $status = utime($stamp, $stamp, $path); return $status; } sub link_exists { my $path = shift; return 0 unless -l $path; my $val = readlink($path); return 0 unless defined($val); return 0 if $val eq ".deleted"; return 1; } sub file_exists { my $path = shift; if (-l $path) { my $val = readlink($path); return 0 if $val eq ".deleted"; } return 1 if -f $path; return 0; } sub any_exists { my $path = shift; return 1 if link_exists($path); return 1 if file_exists($path); return 0; } ################################################################## # global variables my $Id = '$Id$ '; my $user_version = 0.1; my $marsadm_version = 2.9; # some rough hint at newer features my $mars = "/mars"; my $host = `uname -n` or ldie "cannot determine my network node name\n"; chomp $host; check_id($host); my $real_host = $host; my $backup_dir = "$mars/backups-" . time(); my $force = 0; my $ignore_sync = 0; my $ignore_deleted_peers = 1; # 0 = off, 1 = only for certain reports, 2 = BEWARE: ONLY FOR EXPERTS my $cron_mode = 0; my $cron_autoclean_days = 0; my $timeout = 600; my $phase_nr = 0; my $ssh_port = 22; my $ssh_opts = "-A -o StrictHostKeyChecking=no -o ConnectTimeout=5"; my $ssh_probe = "uname -a"; my $rsync_opts = "-av --exclude='.deleted-*'"; my $systemd_enabled = 1; my $kernel_version = 0; my $kernel_features_version = -1; my $kernel_strategy_version = -1; my $kernel_flags_version = ~0x0; my %known_ips; ################################################################## # general helpers my $allow_fail_action = undef; # Failure compensation. # This is called when do_one_res() fails sub fail_action { my ($cmd, $res) = @_; lprint "FAILURE cmd='$cmd' res='$res'\n"; $error_count++; return unless $allow_fail_action; lprint "===== BEGIN FAILURE COMPENSATION cmd='$cmd' res='$res'\n"; # prevent recursive ldie eval { &$allow_fail_action(@_); my $sub_status = $?; lprint "COMPENSATION sub_status='$sub_status'\n"; 1; } or ( lwarn "DOUBLE FAILURE detected.\n" ); lprint "===== END FAILURE COMPENSATION cmd='$cmd' res='$res'\n"; } sub _device_name { my ($res, $peer) = @_; $peer = $host unless defined($peer); my $name = get_link("$mars/resource-$res/device-$peer", 1); $name = $res if (!$name || $name eq "(none)"); return $name; } sub device_name { my $name = _device_name(@_); $name = "/dev/mars/$name" if (defined($name) && $name); return $name; } sub device_exists { my ($res, $peer) = @_; $peer = $host unless defined($peer); return 0 if $peer eq "(none)"; if ($peer eq $real_host) { # Silent fallback to local detection for old kernel module versions my $buildtag = get_alive_link("buildtag", $peer, 1); if (!$buildtag) { # VERY old MARS modules dont report their version $buildtag = `cut -d' ' -f1 < /proc/sys/mars/version`; # Sometimes "never touch a running system" is a BAD strategy... lwarn "Please upgrade your EXTREMELY OLD module version '$buildtag'\n" if $buildtag; } if (defined($buildtag) && $buildtag =~ m/([0-9]+)($|\()/ && $1 <= 97) { my $name = device_name($res, $peer); my $val = (-b $name) ? 1 : 0; return $val; } } my $lnk = "$mars/resource-$res/actual-$peer/if-on"; my $val = get_link($lnk, 2); # backwards compatibility to old kernels my $lnk_old = "$mars/resource-$res/actual-$peer/device-$peer"; my $val_old = get_link($lnk_old, 2); if (defined($val_old) && $val_old ne "" && (!defined($val) || $val eq "" || get_link_stamp($lnk_old) > get_link_stamp($lnk))) { $val = $val_old; } if (!defined($val) || $val eq "") { # Fallback to local device my $name = device_name($res, $peer); if ($peer eq $real_host) { lwarn "Unexpected fallback to local device detection $name for $peer\n"; $val = (-b $name) ? 1 : 0; my $pre_txt = "Unexpected fallback result: device '$name' is"; if ($val) { my $ls = `ls -ld $name`; lwarn "$pre_txt present as $ls\n"; } else { lwarn "$pre_txt NOT locally present\n"; } } else { lwarn "Cannot determine device presence for $peer\n"; } } return $val; } ################################################################## # ssh helpers # deprecated, to disappear in a future release my %ssh_ips; my %ssh_ports; sub make_ssh_cmd { my ($peer, $no_login) = @_; return "" unless $ssh_port; my $ssh = "ssh $ssh_opts"; if (!defined($ssh_ports{$peer})) { my $port; my $real_peer; my $peer_ip = get_link("$mars/ips/ip-$peer", 2); # workaround firewall hell at some installations... RETRY: for my $this_peer ($peer, $peer_ip) { next unless defined($this_peer); next unless $this_peer; # check whether machine is reachable if (system("ping -c1 $this_peer")) { lwarn "cannot ping '$this_peer'\n"; next; } # first try given parameters, then port 22, then ssh_config defaults my %seen = (); foreach my $try_port ($ssh_port, 22, 0) { next if defined($seen{$try_port}); $seen{$try_port} = 1; if ($try_port && system("if which nc; then nc -v -4 -w 5 \"$this_peer\" $try_port < /dev/null > /dev/null; fi")) { lwarn "SSH port '$try_port' for '$this_peer' is not reachable\n"; next; } my $opt_port = $try_port ? "-p $try_port" : ""; if (!system("$ssh $opt_port root\@$this_peer \"$ssh_probe\"")) { ($real_peer, $port) = ($this_peer, $try_port); last RETRY; } lwarn "SSH to '$this_peer' does not work on port '$try_port'\n"; } } ldie "SSH to '$peer' does not work\n" unless defined($port); $ssh_ips{$peer} = $real_peer; $ssh_ports{$peer} = $port; } my $real_peer = $ssh_ips{$peer}; $real_peer = $peer if !defined($real_peer); my $port = $ssh_ports{$peer}; $ssh .= " -p $port" if $port > 0; return ($real_peer, $port, $ssh) if defined($no_login); return "$ssh root\@$real_peer" } sub ssh_cmd { my ($peer, $cmd, $no_fail) = @_; return 1 unless $ssh_port; if ($peer ne $real_host) { $cmd =~ s/'/\'/g; $cmd = make_ssh_cmd($peer) . " '$cmd'"; } else { $cmd =~ s/'/\'/g; $cmd = "bash -c '$cmd'"; } ldie "empty command\n" unless $cmd; my $status = system($cmd); if ($status and !defined($no_fail)) { ldie "SSH to '$peer' command '$cmd' failed\n"; } return $status; } sub rsync_cmd { my ($peer, $cmd, $no_fail) = @_; return 1 unless $ssh_port; my ($real_peer, $port, $ssh) = make_ssh_cmd($peer, 1); $cmd =~ s/(? 0x00000001, "READING" => 0x00000002, "WRITING" => 0x00000004, "WRITE" => 0x00000008, "MAY_WRITE" => 0x00000010, "SKIP_SYNC" => 0x00000020, "NODATA" => 0x00000040, ); my %FEATURES_FLAGS = ( "CHKSUM_MD5_OLD" => 0x00010000, "CHKSUM_MD5" => 0x00020000, "CHKSUM_CRC32C" => 0x00040000, "CHKSUM_CRC32" => 0x00080000, "CHKSUM_SHA1" => 0x00100000, "COMPRESS_LZO" => 0x01000000, "COMPRESS_LZ4" => 0x02000000, "COMPRESS_ZLIB" => 0x04000000, ); my %FLAGS_FEATURES; my $chksum_features = 0x0; my $compress_features = 0x0; sub featuresflags2txt { my ($flags, $skip_unknown) = @_; $flags = 0x0 unless (defined($flags) && $flags ne ""); $flags = hex($flags) if $flags =~ m/^0x/; my $txt = ""; for (my $mask = 0x1; $mask < (1 << 32); $mask <<= 1) { my $bit = $flags & $mask; next unless $bit; if (defined($FLAGS_FEATURES{$bit})) { $txt .= "|" if $txt; $txt .= $FLAGS_FEATURES{$bit}; } elsif (!defined($skip_unknown)) { $txt .= "|" if $txt; $txt .= sprintf("0x%x", $bit); } } return $txt; } sub txt2featuresflags { my ($cmd, $txt) = @_; my $flags = 0x0; foreach my $this_txt (split(/\s*[|]\s*/, $txt)) { chomp $this_txt; $this_txt =~ s/^\s+//; ldie "Digest names must start with 'CHKSUM_'" if ($cmd =~ m/digest/ && $this_txt !~ m/^CHKSUM/); if (defined($FEATURES_FLAGS{$this_txt})) { $flags |= $FEATURES_FLAGS{$this_txt}; } elsif ($this_txt =~ m/^0x[0-9a-f]+/) { $flags |= hex($this_txt); } elsif ($this_txt =~ m/^0[0-9]+/) { $flags |= oct($this_txt); } elsif ($this_txt =~ m/^[0-9]+$/) { $flags |= $this_txt; } else { ldie "Unknown symbolic feature flag '$this_txt'\n"; } } return sprintf("0x%08x", $flags); } ################################################################## # Resource lists and their peers # 2-dimensional caches for cartesian product my %total_resources; my %member_resources; my %guest_resources; my %any_resources; my %total_peers; my %member_peers; my %guest_peers; my %any_peers; # classical 1-dimensional caches my %file_info; my %is_deleted_peer; my $cache_dir = "$mars/cache-$real_host"; sub __read_cache { my ($filename, $hash, $dimensions) = @_; $dimensions = 2 unless $dimensions; open(my $fh, "<", $filename) or return 0; while (my $line = <$fh>) { chomp $line; # start with the highest number of dimensions if ($dimensions == 2) { my ($key1, $key2, $val) = split(" ", $line, 3); $$hash{$key1}{$key2} = $val; } elsif ($dimensions == 1) { my ($key, $val) = split(" ", $line, 2); $$hash{$key} = $val; } else { lwarn "read_cache internal: bad #dimensions '$dimensions'\n"; return 0; } } close($fh); lprint "READ CACHE $filename\n" if $verbose >= 2; return 1; } sub _read_cache { return 0 if $kernel_strategy_version < 4; my $inval_path = "$cache_dir/invalid"; if (-l $inval_path) { _reset_resources(); return 0; } my $ok = __read_cache("$cache_dir/total_resources.cache", \%total_resources) && __read_cache("$cache_dir/member_resources.cache", \%member_resources) && __read_cache("$cache_dir/guest_resources.cache", \%guest_resources) && __read_cache("$cache_dir/any_resources.cache", \%any_resources) && __read_cache("$cache_dir/total_peers.cache", \%total_peers) && __read_cache("$cache_dir/member_peers.cache", \%member_peers) && __read_cache("$cache_dir/guest_peers.cache", \%guest_peers) && __read_cache("$cache_dir/any_peers.cache", \%any_peers) && __read_cache("$cache_dir/is_deleted_peer.cache", \%is_deleted_peer, 1); if (!$ok) { _reset_resources(); } return $ok; } sub __write_cache { my ($filename, $hash, $dimensions) = @_; $dimensions = 2 unless $dimensions; my $tmpname = "$filename.tmp.$$"; local $OFS = " "; local $ORS = "\n"; open(my $fh, ">", $tmpname) or return 0; foreach my $key1 (keys(%$hash)) { # start with the highest number of dimensions if ($dimensions == 2) { my $sub_hash = $$hash{$key1}; foreach my $key2 (keys(%$sub_hash)) { my $val = $$sub_hash{$key2}; if (!print $fh ($key1, $key2, $val)) { unlink($tmpname); return 0; } } } elsif ($dimensions == 1) { my $val = $$hash{$key1}; if (!print $fh ($key1, $val)) { unlink($tmpname); return 0; } } else { lwarn "write_cache internal: bad #dimensions '$dimensions'\n"; unlink($tmpname); return 0; } } if (!close($fh)) { unlink($tmpname); return 0; } rename($tmpname, $filename); return 1; } sub _write_cache { mkdir($cache_dir); my $ok = __write_cache("$cache_dir/total_resources.cache", \%total_resources) && __write_cache("$cache_dir/member_resources.cache", \%member_resources) && __write_cache("$cache_dir/guest_resources.cache", \%guest_resources) && __write_cache("$cache_dir/any_resources.cache", \%any_resources) && __write_cache("$cache_dir/total_peers.cache", \%total_peers) && __write_cache("$cache_dir/member_peers.cache", \%member_peers) && __write_cache("$cache_dir/guest_peers.cache", \%guest_peers) && __write_cache("$cache_dir/any_peers.cache", \%any_peers) && __write_cache("$cache_dir/is_deleted_peers.cache", \%is_deleted_peer, 1); if (!$ok) { _reset_resources(); } return $ok; } sub _scan_caches { return if %total_peers; return if _read_cache(); # Reset all 2-dimensional hashes %total_resources = (); %member_resources = (); %guest_resources = (); %any_resources = (); %total_peers = (); %member_peers = (); %guest_peers = (); %any_peers = (); # Reset 1-dimensional hashes %is_deleted_peer = (); my $ips_glob = "$mars/ips/ip-*"; my $resource_glob = "$mars/resource-*"; my $active_glob = "$mars/resource-*/{device,data,replay}-*"; my $probe_dir = "$mars/probe-$real_host"; # do not include too much outdated information if (-d $probe_dir && is_link_recent($probe_dir)) { $ips_glob .= " $probe_dir$ips_glob"; $resource_glob .= " $probe_dir$resource_glob"; $active_glob .= " $probe_dir$active_glob"; } # Add all known hosts to %total_peers but _not_ to %any_peers. # Reason: some hosts might not be member/guest of any resource foreach my $path (raw_glob($ips_glob)) { $path =~ m:/ip-(.*):; my $this_peer = $1; my $peer_ip = get_raw_link($path); my $deleted_peer = defined($peer_ip) && $peer_ip eq ".deleted"; if ($deleted_peer) { # ABOLUTE NOGO: the currently running host CANNOT be deleted if ($this_peer eq $real_host) { lwarn "IMPORTANT: this script is running under the REAL hostname '$real_host'\n"; lwarn "IMPORTANT: but my own entry '$path' is marked as '.deleted'\n"; lwarn "IMPORTANT: this is a NOGO and it CANNOT WORK!\n"; lwarn "IMPORTANT: the machine name (uname -a or /etc/hostname) must NEVER be changed!!!!!!!!\n"; lwarn "IMPORTANT: Either you are trying to revive a ZOMBIE MACHINE (maybe under a bad name),\n"; lwarn "IMPORTANT: OR you have a MISCONFIGURED hostname '$real_host' which can potentially DESTROY YOUR DATA,\n"; lwarn "IMPORTANT: OR you need a MANUAL REPAIR right now,\n"; lwarn "IMPORTANT: OR some other SERIOUS PROBLEM I cannot know.\n"; lwarn "IMPORTANT: only when 100% sure and CHECKED that nothing can go wrong (RTFM = Read The Fucking Manual, Luke!), you are _responsible_ for commands like 'marsadm lowlevel-set-host-ip' and friends\n"; } else { $is_deleted_peer{$this_peer} = 1; } } if ($this_peer =~ $match_reserved_id) { lwarn "IMPORTANT: for whatever reason, you have configured a syntactically invalid peer name '$this_peer' in /mars/ips/.\n"; lwarn "IMPORTANT: 'none' or '(none)' is a reserved name with a special meaning.\n"; lwarn "IMPORTANT: please fix this by hand. I cannot know how this has happened, and the reasons.\n"; lwarn "IMPORTANT: I am now continuing at YOUR risk, but expect some SERIOUS PROBLEMS.\n"; } if ($ignore_deleted_peers > 1) { # Important: deleted peers NEED to be kept out of some games. # Silently setting this to 2 can be dangerous. # Reason: when --ignore-deleted-peers is at the default ==1, we report some "greenwashed" picture to the users. # Non-experts might crash into some non-visible pitfalls. # For example, there are some exceptions where "deleted" peers are appearing # _regulalry_ although they are _not_ _really_ deleted, # like currently running split-cluster & co. # NOTICE: split-cluster is an exception by itself. It is deprecated anyway. my $is_peer_acceptable = !$is_deleted_peer{$this_peer} || is_link_recent($path, $window); if (!$is_peer_acceptable) { lwarn "Your risk as requested by --ignore-deleted-peers=$ignore_deleted_peers: skipping deleted peer '$this_peer'\n"; next; } } # All has been checked now: rember this peer. $total_peers{$this_peer} = {}; } # Add all known resources to %total_resources but _not_ to %any_resources. # Reason: some resources might exist but have no members / guests. foreach my $path (lamport_glob($resource_glob)) { $path =~ m:/resource-(.*):; my $this_res = $1; if ($this_res =~ $match_reserved_id) { lwarn "IMPORTANT: for whatever reason, you have configured a syntactically invalid resource name '$this_res' in directory '/mars'.\n"; lwarn "IMPORTANT: 'none' or '(none)' is a reserved name with a special meaning.\n"; lwarn "IMPORTANT: please fix this by hand. I cannot know how this has happened, and the reasons.\n"; lwarn "IMPORTANT: I am now continuing at YOUR risk, but expect some SERIOUS PROBLEMS.\n"; } $total_resources{$this_res} = {}; } # Now we look at all relevant combinations between resources and hosts my @total_paths = lamport_glob($active_glob); my %paths; foreach my $path (@total_paths) { # %paths hash: make $probe_dir variants globally unique $path =~ s/^$probe_dir//; $paths{$path} = 1; } foreach my $path (@total_paths) { next unless $path =~ m:/resource-([^/]+?)/[a-z]+-(.+):; my $this_res = $1; if ($this_res =~ $match_reserved_id) { lwarn "IMPORTANT: for whatever reason, you have configured a syntactically invalid resource name '$this_res' in directory '$probe_dir'.\n"; lwarn "IMPORTANT: this is from a (remote) probe, and there might be very strange reasons like network communication problems or whatever.\n"; lwarn "IMPORTANT: 'none' or '(none)' is a reserved name with a special meaning.\n"; lwarn "IMPORTANT: please fix this by hand. I cannot know how this has happened, and the reasons.\n"; lwarn "IMPORTANT: I am now continuing at YOUR risk, but expect some SERIOUS PROBLEMS.\n"; } my $this_peer = $2; if ($this_peer =~ $match_reserved_id) { lwarn "IMPORTANT: for whatever reason, you have configured a syntactically invalid peer name '$this_peer' in resource '$this_res'.\n"; lwarn "IMPORTANT: this is from a (remote) probe in directory '$probe_dir'.\n"; lwarn "IMPORTANT: there might be very strange reasons like network communication problems or defective filesystems or whatever.\n"; lwarn "IMPORTANT: 'none' or '(none)' is a reserved name with a special meaning.\n"; lwarn "IMPORTANT: please fix this by hand. I cannot know how this has happened, and the reasons.\n"; lwarn "IMPORTANT: I am now continuing at YOUR risk, but expect some SERIOUS PROBLEMS.\n"; } if ($ignore_deleted_peers > 1 && $is_deleted_peer{$this_peer}) { lwarn "Your risk as requested by --ignore-deleted-peers=$ignore_deleted_peers: skipping deleted peer '$this_peer'\n"; next; } # dynamic programming next if $total_resources{$this_res}{$this_peer}; # remember result combinations $total_resources{$this_res}{$this_peer} = 1; $total_peers{$this_peer}{$this_res} = 1; my $is_any = $paths{"$mars/resource-$this_res/device-$this_peer"}; if ($is_any) { $any_resources{$this_res}{$this_peer} = 1; $any_peers{$this_peer}{$this_res} = 1; } my $is_member = $paths{"$mars/resource-$this_res/data-$this_peer"} || $paths{"$mars/resource-$this_res/replay-$this_peer"}; if ($is_member) { $member_resources{$this_res}{$this_peer} = 1; $member_peers{$this_peer}{$this_res} = 1; next; } my $is_guest = $is_any && get_link("$mars/resource-$this_res/actual-$this_peer/prosumer-on", 2) || get_link("$mars/resource-$this_res/todo-$this_peer/exports", 2) =~ m:(^|\+)$this_peer($|\+):; if ($is_guest) { $guest_resources{$this_res}{$this_peer} = 1; $guest_peers{$this_peer}{$this_res} = 1; next; } # Notice: _candidates_ for guests are over here. # They can be determined by set_minus(%any_peers,%member_peers) } if ($verbose) { lprint "====== found " . scalar(keys(%total_peers)) . " total and " . scalar(keys(%member_peers)) . " participating and " . scalar(keys(%guest_peers)) . " guest " . "peers\n"; lprint "====== found" . scalar(keys(%total_resources)) . " total and " . scalar(keys(%member_resources)) . " participating and " . scalar(keys(%guest_resources)) . " guest " . "resources\n"; } if (!_write_cache()) { lwarn "cannot write peer cache\n"; } } sub _reset_resources { system("rm -rf $cache_dir/*.cache"); %total_peers = (); %file_info = (); } sub is_member { my ($res, $peer) = @_; _scan_caches() unless %total_peers; return $member_resources{$res}{$peer}; } sub is_guest { my ($res, $peer) = @_; _scan_caches() unless %total_peers; return $guest_resources{$res}{$peer}; } sub is_any { my ($res, $peer) = @_; _scan_caches() unless %total_peers; return $any_resources{$res}{$peer}; } sub alphanum_cmp { my ($aa, $bb) = ($a, $b); $aa =~ s/([0-9]+)/sprintf("%012d",$1)/eg; $bb =~ s/([0-9]+)/sprintf("%012d",$1)/eg; return $aa cmp $bb; } sub reverse_cmp { my ($aa, $bb) = ($b, $a); $aa =~ s/([0-9]+)/sprintf("%012d",$1)/eg; $bb =~ s/([0-9]+)/sprintf("%012d",$1)/eg; return $aa cmp $bb; } sub get_total_resources { my $peer = shift; $peer = "" if $peer =~ $match_reserved_id; _scan_caches() unless %total_peers; if ($peer) { my $projection = $total_peers{$peer}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%total_resources); } } sub get_member_resources { my $peer = shift; $peer = "" if $peer =~ $match_reserved_id; _scan_caches() unless %total_peers; if ($peer) { my $projection = $member_peers{$peer}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%member_resources); } } sub get_guest_resources { my $peer = shift; $peer = "" if $peer =~ $match_reserved_id; _scan_caches() unless %total_peers; if ($peer) { my $projection = $guest_peers{$peer}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%guest_resources); } } sub get_any_resources { my $peer = shift; $peer = "" if $peer =~ $match_reserved_id; _scan_caches() unless %total_peers; if ($peer) { my $projection = $any_peers{$peer}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%any_resources); } } sub get_total_peers { my $res = shift; _scan_caches() unless %total_peers; if ($res) { my $projection = $total_resources{$res}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%total_peers); } } sub get_member_peers { my $res = shift; _scan_caches() unless %total_peers; if ($res) { my $projection = $member_resources{$res}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%member_peers); } } sub get_guest_peers { my $res = shift; _scan_caches() unless %total_peers; if ($res) { my $projection = $guest_resources{$res}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%guest_peers); } } sub get_any_peers { my $res = shift; _scan_caches() unless %total_peers; if ($res) { my $projection = $any_resources{$res}; return sort alphanum_cmp keys(%$projection); } else { return sort alphanum_cmp keys(%any_peers); } } sub key_intersect { my ($hash1, $hash2) = @_; my %h1 = %$hash1; my %h2 = %$hash2; my %result = %h1; foreach my $key (keys(%h2)) { delete $result{$key}; } return %result; } ################################################################## # HACK for 1&1 specific cm3 detection, to disappear (hopefully) my $systemd_recursive_call = defined($ENV{SYSTEMD_RECURSIVE_CALL}) ? $ENV{SYSTEMD_RECURSIVE_CALL} : "clustermanagerd|nodeagent\\.real"; my $cm3_checked = 0; my $cm3_recursive = 0; sub is_called_recursive { if (!$cm3_checked) { $cm3_checked = 1; my $ppid = getppid(); if ($ppid == 1) { return 1; } my $tree = `/usr/bin/pstree $ppid`; my $recursive_call = ($tree && $tree =~ m{$systemd_recursive_call}); $cm3_recursive = $recursive_call; } return $cm3_recursive; } sub _systemd_enabled { if (!$cm3_checked && is_called_recursive()) { $systemd_enabled = 0; } return $systemd_enabled; } ################################################################## # dynamic systemd control my $systemd_subdir = defined($ENV{SYSTEMD_SUBDIR}) ? $ENV{SYSTEMD_SUBDIR} : "systemd-templates"; my $generated_scripts_subdir = defined($ENV{SYSTEMD_SCRIPTS_SUBDIR}) ? $ENV{SYSTEMD_SCRIPTS_SUBDIR} : "systemd-generated-scripts"; my $predefined_unit_path = "/etc/systemd/system,/run/systemd/system,/usr/lib/systemd/system"; my $systemd_system_dirs = # prefer the "offical" systemd path as documented in "man systemd.unit" defined($ENV{SYSTEMD_UNIT_PATH}) ? join(",", split(":", $ENV{SYSTEMD_UNIT_PATH})) . ( # ending in ":" means to append the predefined path (as documented) $ENV{SYSTEMD_UNIT_PATH} =~ m/:$/ ? $predefined_unit_path : "" ) : # allow mars-specific overrides defined($ENV{SYSTEMD_SYSTEM_DIRS}) ? $ENV{SYSTEMD_SYSTEM_DIRS} : $predefined_unit_path; my $systemd_target_dir = defined($ENV{SYSTEMD_TARGET_DIR}) ? $ENV{SYSTEMD_TARGET_DIR} : "/run/systemd/system"; my $systemd_suffixes = defined($ENV{SYSTEMD_SUFFIXES}) ? $ENV{SYSTEMD_SUFFIXES} : "service,socket,device,mount,automount,swap,target,path,timer,slice,scope,script"; # see man systemd.special my $systemd_predefined = defined($ENV{SYSTEMD_PREDEFINED}) ? $ENV{SYSTEMD_PREDEFINED} : "basic.target,bluetooth.target,cryptsetup-pre.target,cryptsetup.target,ctrl-alt-del.target," . "default.target,emergency.target,exit.target,final.target,getty.target,graphical.target," . "halt.target,hibernate.target,hybrid-sleep.target,initrd-fs.target,initrd-root-device.target," . "initrd-root-fs.target,kbrequest.target,kexec.target,local-fs-pre.target,local-fs.target," . "machines.target multi-user.target,network-online.target,network-pre.target,network.target," . "nss-lookup.target,nss-user-lookup.target,paths.target,poweroff.target,printer.target," . "reboot.target,remote-cryptsetup.target,remote-fs-pre.target,remote-fs.target,rescue.target," . "rpcbind.target,runlevel2.target,runlevel3.target,runlevel4.target,runlevel5.target," . "shutdown.target,sigpwr.target,sleep.target,slices.target,smartcard.target,sockets.target," . "sound.target,suspend.target,swap.target,sysinit.target,syslog.socket,system-update.target," . "time-sync.target,timers.target,umount.target," . "system.slice,user.slice,machine.slice," . "dbus.service,dbus.socket,display-manager.service,system-update-cleanup.service"; my $systemd_watcher_units = defined($ENV{SYSTEMD_WATCHER_UNITS}) ? $ENV{SYSTEMD_WATCHER_UNITS} : "mars-\@escvar\{res}-trigger.path"; my %predefined_unit; foreach my $name (split(",", $systemd_predefined)) { $predefined_unit{$name} = 1; } my $systemctl = defined($ENV{SYSTEMCTL}) ? $ENV{SYSTEMCTL} : "systemctl"; my $systemd_escape = defined($ENV{SYSTEMD_ESCAPE}) ? $ENV{SYSTEMD_ESCAPE} : "@"; my $systemd_incape = defined($ENV{SYSTEMD_INCAPE}) ? $ENV{SYSTEMD_INCAPE} : "\\^"; my $systemd_dependencies = defined($ENV{SYSTEMD_DEPENDENCIES}) ? $ENV{SYSTEMD_DEPENDENCIES} : "Unit|Service|Slice|Sockets|Requires|Requisite|Wants|BindsTo|PartOf|Conflicts|Before|After|OnFailure|PropagatesReloadTo|ReloadPropagatedFrom|JoinsNamespaceOf|RequiresMountsFor|Alias|WantedBy|RequiredBy|Also|DefaultInstance|# ALSO"; my %template_names; my %template_files; my $nr_templates = 0; my $template_max_stamp = 0; sub get_template_files { if (!%template_files) { foreach my $dir (@MARS_PATH) { my $subdir = "$dir/$systemd_subdir"; $subdir = $dir unless -d $subdir; next unless -d $subdir; lprint "==== scanning template directory '$subdir'\n" if $verbose; foreach my $template_file (lamport_glob("$subdir/*.{$systemd_suffixes}")) { my $template_name = $template_file; $template_name =~ s:^.*/::; next unless $template_name; # Only the first hit will win when the same template is in multiple dirs. next if defined($template_files{$template_name}); $template_names{$template_file} = $template_name; $template_files{$template_name} = $template_file; lprint "== found template '$template_name' at '$template_file'\n" if $verbose > 1; my $mtime = get_stamp($template_file); $template_max_stamp = $mtime if $mtime > $template_max_stamp; $nr_templates++; } } } lprint "==== found $nr_templates templates\n" if $verbose; return sort alphanum_cmp keys(%template_names); } my $stable_pid; sub get_instance_files { my ($dir) = @_; $stable_pid = $$ unless $stable_pid; my $glob = "$dir/{*.{$systemd_suffixes},.pre.$stable_pid.*.{$systemd_suffixes}.tmp}"; lprint "GLOB '$glob'\n" if $verbose > 2; my %instance_files; foreach my $instance_file (lamport_glob($glob)) { my ($mtime, $text) = _get_file($instance_file); next unless $text =~ m/^\#\#\# GENERATED FROM: (.+)$/m; my $template_file = $1; my $instance_name = $instance_file; $instance_name =~ s:^.*/::; lprint " INSTANCE '$instance_name'\n" if $verbose > 2; $instance_files{$instance_name} = [$instance_file, $mtime, $template_file]; } return %instance_files; } my %systemd_names; my %systemd_files; sub get_systemd_files { if (!%systemd_names) { foreach my $systemd_file (lamport_glob("{$systemd_system_dirs}/*.{$systemd_suffixes}")) { next if $systemd_file =~ m:$systemd_target_dir:; next if $systemd_file =~ m:$etc_marsadm/$generated_scripts_subdir:; my $systemd_name = $systemd_file; $systemd_name =~ s:^.*/::; $systemd_names{$systemd_file} = $systemd_name; $systemd_files{$systemd_name} = $systemd_file; } } return sort alphanum_cmp keys(%systemd_names); } sub systemctl { my ($args, $verb) = @_; $verb = $verbose unless defined($verb); $args =~ m/^\w+\s*(.*)/; my $rest = $1; my $ord_cmd = "$systemctl $args"; my $cmd = $ord_cmd; my $busy_cmd = ($args =~ m/^(start|stop)/); if ($busy_cmd) { $cmd = "$systemctl --job-mode=fail $args"; } lprint "executing: '$cmd'\n" if $verb > 1; my $nr_retry = 0; my $status; retry: eval { $status = system($cmd); }; if ($status && $busy_cmd) { lwarn "command '$cmd' failed with status=$status\n"; # linear backoff $nr_retry++; sleep($nr_retry); if ($nr_retry <= 5) { goto retry; } elsif ($cmd ne $ord_cmd) { $cmd = $ord_cmd; goto retry; } } return $status; } my %failed; sub _systemd_escape { my ($txt) = @_; my $replac = `systemd-escape --path "$txt"`; chomp $replac; return $replac; } sub subst_systemd_vars { my ($env, $text) = @_; my $parsed = ""; while ($text =~ m/[$systemd_escape]([A-Za-z_][-A-Za-z0-9_]*)?[{]($match_inner)[}]/ps) { my $name = $1 || ""; my $body = $2; $parsed .= $PREMATCH; my $rest = $POSTMATCH; my $this_escape = 0; my $replac; $_ = $name; PRE_SWITCH: { if (/^escvar$/) { $name = ""; $this_escape = 1; last PRE_SWITCH; } if (/^esc$/) { $name = "verbatim"; $this_escape = 1; last PRE_SWITCH; } } $_ = $name; SWITCH: { if (/^eval$/) { $replac = parse_macro($body, $env); last SWITCH; } if (/^$/) { my $varname = parse_macro($body, $env); $replac = $$env{$varname}; if (!defined($replac)) { lwarn "variable '$varname' is undefined\n" unless defined($failed{$varname}); $failed{$varname} = 1; $replac = "UNDEFINED($varname)"; } lprint " subst '$systemd_escape\{$varname\}' => '$replac'\n" if $verbose > 9; last SWITCH; } if (/^verbatim$/) { $replac = $body; last SWITCH; } lwarn "systemd function '$name' is undefined\n"; $replac = $body; } if ($this_escape) { my $orig = $replac; $replac = _systemd_escape($replac); lprint " escape '$orig' => '$replac'\n" if $verbose > 9; } $parsed .= $replac; $text = $rest; } return ($env, $parsed . $text); } sub match_systemd_vars { my ($env, $pattern, $text) = @_; ($env, $pattern) = subst_systemd_vars($env, $pattern); ($env, $text) = subst_systemd_vars($env, $text); my @names; my $regex = ""; while ($pattern =~ m/[$systemd_incape][{]([A-Za-z_][A-Za-z0-9_]*)[}]/ps) { my $name = $1; my $pre = $PREMATCH; my $post = $POSTMATCH; push @names, $name; $regex .= $pre . "(.+)"; $pattern = $post; } $regex .= $pattern; $regex =~ s/\\/\\\\/g; my $nr = 1; if ($text =~ m/^$regex$/p) { foreach my $name (@names) { my $val; eval "\$val = \$$nr;"; $$env{$name} = $val; $nr++; } return ($env, $text); } return (undef, $text); } sub _make_var_name { my ($file_name) = @_; chomp $file_name; $file_name =~ s:^.*/::; my $abs_dir = $systemd_target_dir; if ($file_name =~ m/\.script$/) { $abs_dir = "$etc_marsadm/$generated_scripts_subdir"; } mkdir $abs_dir; $stable_pid = $$ unless $stable_pid; my $res_file = "$abs_dir/$file_name"; my $pre_file = "$abs_dir/.pre.$stable_pid.$file_name.tmp"; my $tmp_file = "$abs_dir/.tmp.$stable_pid.$file_name.tmp"; return ($res_file, $pre_file, $tmp_file); } sub _get_file { my ($filename) = @_; my $cached = $file_info{$filename}; if ($cached) { return @$cached; } my $IN; local $/; # slurp if (!open($IN, "<", $filename)) { lwarn "cannot open file '$filename' ($!)\n"; return (0, ""); } my $mtime = get_stamp($IN); my $text = <$IN>; close($IN); $file_info{$filename} = [$mtime, $text]; return ($mtime, $text); } my %referenced_units; my %shortcut_units; sub _scan_refs { my ($text) = @_; while ($text =~ s/^[#][#][#]\s*REF:\s*(.*)//m) { my $next_unit = $1; # Remember the encountered name $referenced_units{$next_unit} = 1; $text = $POSTMATCH; } } sub _instantiate_systemd_unit { my ($env, $template_file, $out_name) = @_; my ($res_file, $pre_file, $tmp_file) = _make_var_name($out_name); lprint "==== Translate systemd template '$template_file' => '$res_file'\n" if $verbose; my ($mtime, $text) = _get_file($template_file); if (!$text) { lwarn "cannot get template '$template_file'\n"; return (0, ""); } # Check timestamps my $old_text; my $old_mtime; my $IN; if (open($IN, "<", $res_file)) { $old_mtime = get_stamp($IN); local $/; # slurp my $old_text = <$IN>; close($IN); if ($old_mtime && $old_mtime == $mtime) { lprint "== systemd unit '$res_file' has unchanged mtime=$mtime\n" if $verbose; $shortcut_units{$res_file} = 1; _scan_refs($old_text); return (1, $res_file); } lprint "== systemd unit '$res_file' mtime $old_mtime => $mtime\n" if $verbose; } my %this_references; my $header; $header = "### GENERATED FROM: $template_file\n"; $header .= "### GENERATED NAME: $out_name\n"; $header .= "### GENERATED TO: $res_file\n"; $header .= "### TEMPLATE MTIME: $mtime\n"; $header .= "###\n"; ($env, $text) = subst_systemd_vars($env, $text); my $scan = $text; while ($scan =~ m/^\s*($systemd_dependencies)\s*=\s*(.*?)$/mp) { my $next_unit_list = $2; $scan = $POSTMATCH; while ($next_unit_list =~ m/[^\s]+/) { my $next_unit = $MATCH; $next_unit_list = $POSTMATCH; # some units like mount units may be specified as paths. $next_unit = _systemd_escape($next_unit) if $next_unit =~ m:/:; next if $this_references{$next_unit}; $this_references{$next_unit} = 1; lprint "-- '$template_file' found reference to '$next_unit'\n" if $verbose > 2; # Remember the encountered name $referenced_units{$next_unit} = 1; $header .= "### REF: $next_unit\n"; } } $header .= "###\n"; if ($text =~ m/^([#][!].+)/p) { my $hash_bang = $1; $text = $hash_bang . "\n" . $header . $POSTMATCH; } else { $text = $header . $text; } # new $text is finished, write when necessary. if ($old_text && $old_text eq $text) { lprint "== systemd unit '$res_file' has not changed\n" if $verbose; $shortcut_units{$res_file} = 1; _scan_refs($text); return (1, $res_file); } if (!open(OUT, ">", $tmp_file)) { lwarn "cannot create '$tmp_file'\n"; unlink($tmp_file); return (0, ""); } unless (print OUT $text) { lwarn "cannot write '$tmp_file'\n"; close(OUT); unlink($tmp_file); return (0, ""); } unless (close(OUT)) { lwarn "cannot close '$tmp_file'\n"; unlink($tmp_file); return (0, ""); } # I would like to use 0400 instead, but this leads to masses of warnings like # Configuration file /run/systemd/system/daemon-reload.service is marked world-inaccessible. # This has no effect as configuration data is accessible via APIs without restrictions. # Proceeding anyway. my $perm = 0444; if ($res_file =~ m/\.script$/) { $perm = 0544; } chmod($perm, $tmp_file); utime($mtime, $mtime, $tmp_file); unless (rename($tmp_file, $pre_file)) { lwarn "cannot rename '$tmp_file' to '$pre_file'\n"; unlink($tmp_file); return (0, ""); } lprint "==== Translated systemd template '$template_file' => '$res_file'\n" if $verbose; return (1, $res_file); } my %generated_units; sub make_systemd_unit { my ($cmd, $res, $target, $force_generate) = @_; return 0 if $predefined_unit{$target}; # dynamic programming $res = "" unless defined($res); my $gen_key = "$cmd.$res.$target"; if (defined($generated_units{$gen_key})) { lprint "systemd unit '$gen_key' already generated\n" if $verbose; return $generated_units{$gen_key}; } my @res_list; if ($res) { @res_list = ($res); } else { @res_list = get_any_resources($host); } my ($found_env, $found_template_file, $found_subst); lprint "==== searching templates for '$target'\n" if $verbose; foreach my $template_file (get_template_files()) { my $template_name = $template_names{$template_file}; next unless $template_name; # avoid exhaustive search if ($template_name =~ m/^([A-Za-z0-9_.]+)/) { my $prefix = $1; next unless $target =~ m/^$prefix/; } foreach my $res (@res_list) { ($template_name, my $env) = make_env($cmd, $res, $template_name); my $subst = $template_name; (my $new_env, $subst) = match_systemd_vars($env, $template_name, $target); if ($new_env) { ($found_env, $found_template_file, $found_subst) = ($new_env, $template_file, $subst); goto found; } } } found: if (!$found_template_file) { foreach my $template_file (get_template_files()) { my $template_name = $template_names{$template_file}; next unless $template_name; foreach my $res (@res_list) { ($template_name, my $env) = make_env($cmd, $res, $template_name); my $subst = $template_name; (my $new_env, $subst) = match_systemd_vars($env, $template_name, $target); if ($subst) { # Check if already installed somewhere else get_systemd_files() unless %systemd_files; if (defined($systemd_files{$subst})) { lprint "systemd unit '$subst' is already present at '$systemd_files{$subst}'\n" if $verbose; $generated_units{$gen_key} = 0; return 0; } } } } my $do_warn = lamport_glob("$mars/resource-*/systemd-*-unit"); lwarn "cannot find any systemd template for target unit '$target'\n" if $do_warn; $generated_units{$gen_key} = 0; return 0; } my ($out_env, $out_name) = subst_systemd_vars($found_env, $found_subst); lprint "==== instantiating template '$found_template_file' as '$out_name'\n" if $verbose; my ($nr, $file, $name) = _instantiate_systemd_unit($out_env, $found_template_file, $out_name); $generated_units{$gen_key} = $nr; return $nr; } sub set_systemd_want_phase1 { my ($cmd, $res, $new) = @_; if ($new ne "(none)") { my $host_path = "$mars/ips/ip-$new"; unless (get_link($host_path, 1)) { lwarn "The hostname '$new' does not exist.\n"; ldie "Refusing to set systemd-want.\n" unless $force; } } my $unit_glob = "$mars/resource-$res/systemd-*-unit"; unless (lamport_glob($unit_glob)) { lwarn "Resource '$res' has no configured systemd units.\n"; lwarn "First configure the resource with marsadm set-systemd-unit.\n"; ldie "Refusing to set systemd-want.\n" unless $force; } my $want_path = "$mars/resource-$res/systemd-want"; my $old = get_link($want_path, 1); return 0 if ($old && $old eq $new); set_link($new, $want_path); finish_links(); return 0; } sub set_systemd_want_phase2 { my ($cmd, $res) = @_; _trigger(3); my $want_path = "$mars/resource-$res/systemd-want"; my $new = get_link($want_path, 1); if ($force) { lprint "De/activation of '$want_path' -> '$new' is not checked due to --force\n"; return 0; } if ($new eq "(none)") { my $fail = 0; my $touch = 0; foreach my $peer (get_any_peers($res)) { next if $peer =~ $match_reserved_id; my $open_count_path = "$mars/resource-$res/actual-$peer/open-count"; my $device_in_use = get_link($open_count_path, 1); if ($device_in_use) { my $name = _device_name($res, $peer); lprint "Device '$name' is in use at '$peer'\n"; if ($peer eq $real_host) { $fail = 1; } else { # mistrust unreachable peers my ($dead_count, $alive_count, $unknown_count) = is_cluster_recent($cmd, $res, $peer); if ($alive_count) { $fail = 1; } else { lwarn "however peer '$peer' is not reachable, continuing for livelock avoidance.\n"; $touch++; } } } } if ($fail || $touch) { lprint "Re-trigger $want_path\n"; systemd_any_trigger($cmd, $res); } else { lprint "All peer devices are closed.\n"; } return $fail; } my $name = _device_name($res, $new); my $dev_present = device_exists($res, $new); if (!$dev_present) { lwarn "device '$name' is not present at '$new'\n"; return 0; } my $fail = 0; my $open_count_path = "$mars/resource-$res/actual-$new/open-count"; my $device_in_use = get_link($open_count_path, 1); if (!$device_in_use) { lprint "Device '$name' not yet opened at '$new'\n"; systemd_any_trigger($cmd, $res); # mistrust unreachable peer if ($new eq $real_host) { $fail = 1; } else { my ($dead_count, $alive_count, $unknown_count) = is_cluster_recent($cmd, $res, $new); if ($alive_count) { $fail = 1; } else { lwarn "however peer '$new' is not reachable, continuing for livelock avoidance.\n"; } } } else { lprint "Device '$name' is open at '$new'.\n"; } return $fail; } sub systemd_present { my ($cmd, $res) = @_; return "" unless _systemd_enabled(); my $unit_glob = "$mars/resource-$res/systemd-*-unit"; return "" unless lamport_glob($unit_glob); my $want_path = "$mars/resource-$res/systemd-want"; my $want = get_link($want_path, 1); return $want; } sub get_systemd_want { my $txt = systemd_present(@_); lprint "$txt\n"; } sub systemd_exists { my ($unit_list) = @_; return 0 unless _systemd_enabled(); foreach my $unit (split(/ +/, $unit_list)) { my $check_cmd = "list-unit-files \"$unit\" | wc -l"; my $count = `$systemctl $check_cmd`; if (!defined($count) || !$count || $count <= 0) { lprint "nothing to do for systemd, unit file '$unit' does not exist.\n"; return 0; } } return 1; } sub systemd_unit_enabled { my ($unit_list) = @_; return 1 unless _systemd_enabled(); foreach my $unit (split(/ +/, $unit_list)) { # .script is assumed as always enabled next if $unit =~ m/\.script$/; my $check_cmd = "is-enabled '$unit' > /dev/null 2>&1"; my $status = systemctl($check_cmd, 0); if ($status) { lprint "systemd unit '$unit' is not existing or not enabled.\n"; return $status; } } return 0; } sub _check_unit_marker { my ($file, $marker) = @_; my ($mtime, $text) = _get_file($file); my $found = ($text =~ m/^[#]\s*$marker/m); return $found; } my %systemctl_pid = (); sub _systemd_op_wait { my ($kill_after_timeout) = @_; return 0 unless %systemctl_pid; my $errors = 0; my $success = 0; my $old_timeout = $timeout; my $signal = "SIGTERM"; my $backoff = 1; for (;;) { my @wait_list = keys(%systemctl_pid); last unless @wait_list; foreach my $pid (@wait_list) { use POSIX ":sys_wait_h"; my $check_pid = waitpid($pid, WNOHANG); my $status = $?; if ($status > 0) { lwarn "UNIT CHILD $pid terminated with status=$status\n"; delete $systemctl_pid{$pid}; $backoff = 1; $errors++; } elsif ($check_pid == $pid) { lprint_stderr "UNIT CHILD $pid terminated successfully\n"; delete $systemctl_pid{$pid}; $backoff = 1; $success++; } } last unless %systemctl_pid; my $reached = sleep_timeout($backoff++, 1); if ($reached && $kill_after_timeout) { kill($signal, keys(%systemctl_pid)); $errors++; } last if $signal eq "SIGKILL"; # give up $signal = "SIGKILL"; $backoff = 1; $timeout = $old_timeout; } %systemctl_pid = (); $timeout = $old_timeout; if ($errors) { lwarn "systemctl: there were were $success successes and $errors errors.\n"; } else { lprint "There were $success successful systemctl operations.\n"; } return $errors; } sub _systemd_op { my ($op, $unit, $do_fork) = @_; return 0 unless _systemd_enabled(); my $has_forked = 0; if ($do_fork && !$child_prefix && $op =~ m/start|stop/) { my $pid = fork(); if (defined($pid)) { if ($pid) { # parent lprint_stderr "UNIT CHILD $pid: $op '$unit'\n"; $systemctl_pid{$pid} = "$op $unit"; return 0; } else { # child: simply continue $child_prefix = "UNIT $op: "; $has_forked = 1; } } } my $status = 0; # special case: .script templates are to be executed directly if ($unit =~ m/\.script$/) { my $dir = ""; foreach my $test_dir ("$etc_marsadm/$generated_scripts_subdir", split(",", $systemd_system_dirs)) { next unless -x "$test_dir/$unit"; $dir = $test_dir; last; } $dir .= "/" if $dir; my $cmd = "'$dir$unit' $op"; lprint "--- running script: '$cmd'\n"; $status = system($cmd); if ($status) { lwarn "script '$cmd' failed, status=$status\n"; } else { lprint "--- script status=$status\n"; } goto done; } if (systemctl("cat '$unit' > /dev/null 2>&1", 0)) { lwarn "systemd unit $unit does not exist.\n"; goto done; } my $ctl_cmd = "is-failed --quiet '$unit'"; my $fail_status = systemctl($ctl_cmd, 0); if (!$fail_status) { my $ctl_cmd = "reset-failed '$unit'"; $status = systemctl($ctl_cmd); lprint "--- resetting failed unit '$unit': status=$status\n"; } if ($op eq "start" || $op eq "restart") { if (systemd_unit_enabled($unit)) { goto done; } } $ctl_cmd = "$op '$unit'"; lprint "--- running systemctl command: $ctl_cmd\n"; $status = systemctl($ctl_cmd); if ($status) { lwarn "command '$systemctl $ctl_cmd' failed, status=$status\n"; } else { lprint "--- systemctl status=$status\n"; } done: # confine to 8bit if ($status < 0 || $status > 255) { $status = 255; lprint "--- correcting status=$status\n"; } exit($status) if $has_forked; return $status; } sub report_systemd_status { my ($cmd, $res, $status, $oper, $peer) = @_; $peer = $host unless $peer; my $response_path = "$mars/resource-$res/userspace/systemd-status-$oper-$peer"; # avoid useless path-triggers when nothing has changed my $old_val = get_link($response_path, 1); if ($old_val eq "" || $old_val != $status) { set_link($status, $response_path); } } sub __systemd_activate_want { my ($cmd, $res, $allow_fork, $override) = @_; return 0 unless _systemd_enabled(); my $want_path = "$mars/resource-$res/systemd-want"; my $want = get_link($want_path, 2); lprint "====== want '$want' for '$want_path'\n" if $verbose; if (!$want) { lprint "Nothing to (de)activate: $want_path does not exist\n" if $verbose; return 0; } my $do_activate = $want eq $host; if ($do_activate) { # Check for device existence if (!device_exists($res, $want)) { my $name = device_name($res, $want); lprint "==== device '$name' is not preset at '$want'\n" if $verbose; $do_activate = 0; } } if (defined($override)) { if ($override != $do_activate) { lprint "Overriding unit activate=$do_activate with $override\n" if $verbose; $do_activate = $override; } } elsif ($do_activate) { my $primary = _get_designated_primary($res); if ($primary ne $host) { # Do not activate for now lprint "Overriding systemd-want: '$host' is not designated primary,\n"; lprint "==== Do not modify resource '$res'\n" if $verbose; return 0; } } my $oper = $do_activate ? "start" : "stop"; my $unit_path = "$mars/resource-$res/systemd-$oper-unit"; my $unit = get_link($unit_path, 2); if (!$unit) { lprint "Nothing to (de)activate: $unit_path does not exist\n" if $verbose; return 0; } my $ctl_cmd = "show \"$unit\""; my $op = "show"; if ($do_activate) { $unit =~ s/ .*//; if (systemd_unit_enabled($unit)) { return 0; } lprint "==== Activate resource '$res' unit '$unit'\n" if $verbose; $op = "start"; } else { $unit =~ s/.* //; lprint "==== Deactivate resource '$res' unit '$unit'\n" if $verbose; $op = "stop"; } my $status = _systemd_op($op, $unit, $allow_fork); report_systemd_status($cmd, $res, $status, $oper); finish_links(); return $status; } sub __systemd_commit { my ($work_dir, $do_delete) = @_; lprint "==== Commit '$work_dir'\n" if $verbose; # Internal destination code: # -2 = needs stop + disable (e.g. deleted) # -1 = needs disable, but no status change # 0 = modified, no status change (for whatever reason) # 1 = new, to enable, no start # 2 = new, needs enable + start # absent = no modification my %changes; my %files = get_instance_files($work_dir); my %renames; my %deletes; my $need_reload = 0; foreach my $target (sort alphanum_cmp keys(%files)) { next if $shortcut_units{"$work_dir/$target"}; if ($target =~ m/^\.pre\.[0-9]+\.(.+?)\.tmp$/) { my $old_target = $1; my $new_target = $target; my ($new_instance, $new_mtime, $new_template) = @{$files{$new_target}}; if (defined($files{$old_target})) { lprint "-- '$old_target' is not new\n" if $verbose > 3; my ($old_instance, $old_mtime, $old_template) = @{$files{$old_target}}; if ($old_mtime == $new_mtime) { lprint "-- '$old_target' equal mtime=$new_mtime\n" if $verbose > 2; $deletes{new_target} = 1; next; } lprint "-- '$old_target' changed mtime from $old_mtime to $new_mtime\n" if $verbose; } $renames{$new_target} = $old_target; if (_check_unit_marker($new_instance, "ALWAYS_DISABLED")) { lprint "-- '$old_target' must remain disabled\n" if $verbose > 2; $changes{$old_target} = -1; next; } elsif (_check_unit_marker($new_instance, "ALWAYS_START")) { lprint "-- '$old_target' must be started\n" if $verbose > 2; $changes{$old_target} = 2; $need_reload++; next; } else { lprint "-- '$old_target' will be enabled, but not started\n" if $verbose > 2; $changes{$old_target} = 1; $need_reload++; } next; } $stable_pid = $$ unless $stable_pid; my $old_target = $target; my $new_target = ".pre.$stable_pid.$old_target.tmp"; my ($old_instance, $old_mtime, $old_template) = @{$files{$old_target}}; if (!defined($files{$new_target})) { if (!$do_delete) { lprint "-- ignoring '$old_target'\n" if $verbose > 2; next; } $deletes{$old_target} = 1; if (_check_unit_marker($old_instance, "KEEP_RUNNING")) { lprint "-- deleted '$old_target' is KEEP_RUNNING\n" if $verbose > 2; $changes{$old_target} = -1; next; } lprint "-- marking deleted '$old_target' for removal\n" if $verbose > 2; $changes{$old_target} = -2; $need_reload++; next; } if (_check_unit_marker($old_instance, "ALWAYS_DISABLED")) { lprint "-- '$old_target' is ALWAYS_DISABLED\n" if $verbose > 2; $changes{$old_target} = -1; next; } my ($new_instance, $new_mtime, $new_template) = @{$files{$new_target}}; my $ok = ($old_mtime == $new_mtime); if ($ok) { lprint "-- '$old_target' was not modified\n" if $verbose > 2; next; } lprint "-- '$old_target' was modified\n" if $verbose > 2; $changes{$old_target} = 0; $need_reload++; } # Cleanup the old situation. # This needs to be done in per-operation cycles, # because there may be inter-unit dependencies. lprint "==== Stopping old / deleted units\n" if $verbose; foreach my $unit (sort alphanum_cmp keys(%changes)) { my $op = $changes{$unit}; if ($op < -1) { _systemd_op("stop", $unit); } } lprint "==== Disabling old / deleted units\n" if $verbose; foreach my $unit (sort alphanum_cmp keys(%changes)) { my $op = $changes{$unit}; if ($op < 0) { _systemd_op("disable", $unit); } } # Commit foreach my $file (keys(%deletes)) { my $path = "$work_dir/$file"; lprint "--- unlink '$path'\n" if $verbose > 2; unlink($path); } foreach my $src (keys(%renames)) { my $dst = $renames{$src}; my $src_path = "$work_dir/$src"; my $dst_path = "$work_dir/$dst"; lprint "--- rename '$src_path' '$dst_path'\n" if $verbose > 2; rename($src_path, $dst_path); } %generated_units = (); # Tell the new situation to systemd. # This needs to be done in per-operation cycles, # because there may be inter-unit dependencies. if ($need_reload) { lprint "==== Restart systemd\n" if $verbose; _systemd_op("start", "daemon-reload.service"); } lprint "==== Enabling new units\n" if $verbose; foreach my $unit (sort alphanum_cmp keys(%changes)) { my $op = $changes{$unit}; if ($op > 0) { _systemd_op("enable", $unit); } } lprint "==== Starting new units\n" if $verbose; foreach my $unit (sort alphanum_cmp keys(%changes)) { my $op = $changes{$unit}; if ($op > 1) { _systemd_op("start", $unit); } } lprint "==== Done commit '$work_dir'\n" if $verbose; } sub systemd_commit { my ($do_delete) = @_; # We need separate target directories for templates and for scripts. # Reason: /run does not allow script execution on many systems. __systemd_commit($systemd_target_dir, $do_delete); my $script_dir = "$etc_marsadm/$generated_scripts_subdir"; __systemd_commit($script_dir, $do_delete); } # THINK: # Would per-resource incremental template generation be better? # Yes, it would scale better, but would it be _correct_? # # The problem is that the macro processor is Turing-complete. # # Thus there might be non-trivial cross-dependencies between generated # unit files. Potentially even cross-resource dependencies. # These might for example depend on non-trivial if conditions, # potentially even purely dynamic environment variables, or whatever. # # At the moment, the complete transitive closure is re-computed once # a small detail has changed. This is on the safe side, but not optimal. # There is certainly room for improvement. However be cautious # with respect to correctness under all cirumstances. # # Knuth is cited: "I can do it in half the time if it doesn't have # to be correct". sub __systemd_generate { my ($cmd, $res, $make_want, $make_watcher, $force_generate) = @_; return unless -d $mars; return unless -d $etc_marsadm; mkdir($systemd_target_dir); mkdir("$etc_marsadm/$generated_scripts_subdir"); $force_generate = 0 unless defined($force_generate); lprint "Generate all templates for '$res' force='$force_generate'.\n"; # Determine all template files. get_template_files(); # Always add all plain templates my %done_units; %referenced_units = (); my $count = 0; foreach my $template_name (sort alphanum_cmp keys(%template_files)) { next if $template_name =~ m/($systemd_incape|$systemd_escape)/; $count += make_systemd_unit($cmd, "UNDEFINED_RESOURCE", $template_name); $done_units{$template_name} = 1; } # Determine all participating resource names. my @res_list; my $do_delete = 0; if ($res) { @res_list = ($res); } else { @res_list = get_any_resources($host); # We can only delete when the full set of transitive dependecies is known. $do_delete = ($make_want && $make_watcher); } # Create initial systemd units foreach my $this_res (@res_list) { if ($make_want) { foreach my $unit_link (lamport_glob("$mars/resource-$this_res/systemd-*-unit")) { my $target = get_link($unit_link); next unless $target; $count += make_systemd_unit($cmd, $this_res, $target, $force_generate); $done_units{$target} = 1; } } if ($make_watcher) { foreach my $target_pattern (split(",", $systemd_watcher_units)) { my ($dummy, $start_env) = make_env($cmd, $this_res, $target_pattern); my ($env, $target) = subst_systemd_vars($start_env, $target_pattern); $count += make_systemd_unit($cmd, $this_res, $target, $force_generate); $done_units{$target} = 1; } } } # Compute the transitive closure of referenced units lprint "== adding transitive units for $count start units.\n" if $verbose; for (;;) { my $old_count = $count; foreach my $target (sort alphanum_cmp keys(%referenced_units)) { next if $done_units{$target}; $count += make_systemd_unit($cmd, undef, $target); $done_units{$target} = 1; } last if ($count <= $old_count); } lprint "== $count units generated.\n" if $verbose; # Check and commit the new situation systemd_commit($do_delete); } sub __systemd_want_ops { my ($cmd, $res) = @_; # Barrier, for safety _systemd_op_wait(); # Activate the listed units. my @res_list; if ($res && $res ne "all" && $res ne "*") { @res_list = ($res); } else { @res_list = get_any_resources($host); } my $allow_fork = scalar(@res_list) > 1; foreach my $res (@res_list) { __systemd_activate_want($cmd, $res, $allow_fork); } _systemd_op_wait(); } sub __systemd_touch_trigger { my ($cmd, $res) = @_; return if $cmd =~ m/extern/; return unless _systemd_enabled(); my $todo_global = "$mars/todo-global"; my $base_dir = "$mars/userspace"; my $want_path = "$todo_global/systemd-global-trigger"; my $trigger = "$base_dir/systemd-trigger"; if ($res && $res ne "all" && $res ne "*") { $base_dir = "$mars/resource-$res"; $want_path = "$base_dir/systemd-want"; $trigger = $want_path; } elsif (!-d $todo_global) { lwarn "fixing non-existing $todo_global\n"; mkdir($todo_global); set_link($host, $want_path); finish_links(); } mkdir($base_dir) unless -d $base_dir; lprint "Triggering '$trigger' for '$cmd'\n" if $verbose; safe_touch($want_path); # ensure that trigger file exists safe_touch($trigger) || safe_creat($trigger); } my %any_triggered; sub systemd_any_trigger { my ($cmd, $res) = @_; return unless _systemd_enabled(); $res = "" unless defined($res); __systemd_touch_trigger($cmd, $res); if (!$res) { my @res_list = get_any_resources($host); foreach my $this_res (@res_list) { __systemd_touch_trigger($cmd, $this_res); } } } sub systemd_trigger_extern { my ($cmd, $res) = @_; $res = "" unless defined($res); return unless -d $systemd_target_dir; my $called_external = ($cmd =~ m/extern/); if ($called_external) { $cm3_checked = 1; $systemd_enabled = 1; } elsif (is_called_recursive()) { return 0; } # Ensure disjointness of path watchers my $make_want = ($called_external && $res); my $make_watcher = (!$make_want || !$res); my $force_generate = !$called_external; eval { __systemd_generate($cmd, $res, $make_want, $make_watcher, $force_generate); }; if ($make_want) { lprint "Want '$res'\n"; __systemd_want_ops($cmd, $res); } else { lprint "Trigger '$res'\n"; systemd_any_trigger($cmd, $res); } return 0; } sub _get_systemd_unit { my ($cmd, $res) = @_; my $unit_start_path = "$mars/resource-$res/systemd-start-unit"; my $unit_stop_path = "$mars/resource-$res/systemd-stop-unit"; my $start_unit = get_link($unit_start_path, 2); my $stop_unit = get_link($unit_stop_path, 2); if (!$start_unit) { lprint_stderr "Nothing to show: start unit $unit_start_path does not exist\n" if $verbose; return ""; } if (!$stop_unit) { lwarn "Strange: stop unit $unit_stop_path does not exist\n"; return $start_unit; } return "$start_unit $stop_unit"; } sub get_systemd_unit { my $unit = _get_systemd_unit(@_); lprint "$unit\n" if $unit; } sub _get_default_unit { my ($cmd, $res, $marker) = @_; my $found = ""; foreach my $template_file (get_template_files()) { my $template_name = $template_names{$template_file}; next unless $template_name; if (!_check_unit_marker($template_file, $marker)) { next; } $found = $template_name; my ($dummy, $start_env) = make_env($cmd, $res, $template_name); my ($env, $subst) = subst_systemd_vars($start_env, $template_name); $found = $subst if $subst; last; } return $found; } sub set_systemd_unit_phase1 { my ($cmd, $res, $start_name, $stop_name) = @_; ldie "Start unit name is undefined\n" unless defined($start_name); # typically, this matches to capital DEFAULT my $default_regex = qr{^[A-Z][A-Z_0-9]*$}; my $use_default = ""; if ($start_name =~ m{$default_regex}) { $use_default = $start_name; my $marker = "${start_name}_START"; $start_name = _get_default_unit($cmd, $res, $marker); lprint "Marker '$marker' leads to start template '$start_name'\n"; } if ($use_default && !$stop_name) { my $marker = "${use_default}_STOP"; $stop_name = _get_default_unit($cmd, $res, $marker); lprint "Marker '$marker' (derived from start marker) leads to stop template '$stop_name'\n"; } elsif ($stop_name && $stop_name =~ m{$default_regex}) { my $marker = "${stop_name}_STOP"; $stop_name = _get_default_unit($cmd, $res, $marker); lprint "Marker '$marker' leads to stop template '$stop_name'\n"; } unless ($stop_name) { lprint "Unspecified stop unit: ALSO use start unit/template '$start_name' for stopping.\n"; $stop_name = $start_name; } my $unit_start_path = "$mars/resource-$res/systemd-start-unit"; my $unit_stop_path = "$mars/resource-$res/systemd-stop-unit"; # Convenience: try to fix any non-escaped resource names if ($start_name =~ m/($res)/p) { my $pre = $PREMATCH; my $post = $POSTMATCH; my $replac = _systemd_escape($res); $start_name = $pre . $replac . $post; } if ($stop_name =~ m/($res)/p) { my $pre = $PREMATCH; my $post = $POSTMATCH; my $replac = _systemd_escape($res); $stop_name = $pre . $replac . $post; } my $want_path = "$mars/resource-$res/systemd-want"; if ($start_name) { set_link($start_name, $unit_start_path); my $target = _get_designated_primary($res, 1); if ($target) { set_link($target, $want_path); } } else { _create_delete($unit_start_path); _create_delete($want_path); } if ($stop_name) { set_link($stop_name, $unit_stop_path); } else { _create_delete($unit_stop_path); _create_delete($want_path); } return 0; } sub set_systemd_unit_phase2 { my ($cmd, $res) = @_; finish_links(); systemd_trigger_extern($cmd, $res); return 0; } ################################################################## # path correction sub correct_path { my ($path) = @_; # actual switches $path =~ s:(/is-attach)[a-z]*$:$1ed:; $path =~ s:(/is-)(fetch)[a-z]*$:$1copy:; $path =~ s:(/is-)(apply)[a-z]*$:$1replay:; $path =~ s:(/is-(copy|replay|sync))[a-z]*$:$1ing:; $path =~ s:(/is-)(primary|secondary)[a-z]*$:$1primary:; # todo switches $path =~ s:(/fetch)[a-z]*$:/connect:; $path =~ s:(/apply)[a-z]*$:/allow-replay:; $path =~ s:(/replay)[a-z]*$:/allow-replay:; $path =~ s:(/todo-.*/(primary|secondary))[a-z]*$:/primary:; return $path; } ################################################################## # low-level infrastructure my @link_list = (); my %link_hash; sub get_raw_link { my ($path) = @_; my $result = readlink($path); return $result; } sub get_link { my ($path, $unchecked) = @_; my $result = readlink($path); if (!defined($result)) { ldie "cannot read symlink '$path'\n" unless $unchecked; lwarn "cannot read symlink '$path'\n" if $unchecked == 1; $result = ""; } $result = "" if $result eq ".deleted"; return $result; } sub get_link_stamp { my ($path) = @_; my @stat = lstat($path); return 0 if (!@stat); return $stat[9]; } sub get_stamp { my ($path_or_handle) = @_; my @stat = stat($path_or_handle); return 0 if (!@stat); return $stat[9]; } sub is_recent { my ($stamp, $wind) = @_; return 0 unless ($stamp && $stamp =~ m/^\s*[0-9.]/); $wind = $window * 2 unless $wind; return 1 if $stamp + $wind >= mars_time(); return 0; } sub is_link_recent { my ($path, $wind) = @_; my @stat = lstat($path); return 0 if (!@stat); return is_recent($stat[9], $wind); } sub get_alive_link { my ($name, $peer, $unchecked) = @_; my $path = "$mars/actual-$peer/$name"; my $result; if ($compat_alivelinks) { $result = get_link($path, 2); my $path_old = "$mars/$name-$peer"; my $result_old = get_link($path_old, 2); return $result_old if !$result; # determine the newer link if (get_link_stamp($path_old) > get_link_stamp($path)) { return $result_old if $result_old; } } else { $result = get_link($path, $unchecked); } return $result; } sub get_alive_stamp { my ($name, $peer) = @_; my $path = "$mars/actual-$peer/$name"; my $result = get_link_stamp($path); if ($compat_alivelinks) { my $path_old = "$mars/$name-$peer"; my $result_old = get_link_stamp($path_old); return $result_old if !$result; # determine the newer link if ($result_old > $result) { return $result_old; } } return $result; } sub alive_glob { my ($name, $hosts) = @_; $hosts = "*" unless $hosts; my %peers; foreach my $path (lamport_glob("$mars/actual-$hosts/$name")) { next unless $path =~ m:/actual-(.+)/:; my $peer = $1; next unless $peer; next if $peer =~ $match_reserved_id; $peers{$peer} = 1; } if ($compat_alivelinks) { foreach my $path (lamport_glob("$mars/$name-$hosts")) { next unless $path =~ m:/$name-(.+):; my $peer = $1; next unless $peer; next if $peer =~ $match_reserved_id; $peers{$peer} = 1; } } return sort alphanum_cmp keys(%peers); } sub to_tmp { my $path = shift; $path =~ s:^(.*)/:$1/.tmp.$$.:; return $path; } sub set_link { my ($src, $dst) = @_; # safeguard trailing slashes $dst =~ s:/$::; my $dst_tmp = to_tmp($dst); unlink($dst_tmp); symlink($src, $dst_tmp) or ldie "cannot create symlink '$dst' -> '$src'\n"; # the _order_ is important! remove existing intermediate element before re-appanding if (exists($link_hash{$dst})) { my @copy = @link_list; @link_list = (); foreach my $elem (@copy) { next if $elem eq $dst; push @link_list, $elem; } } $link_hash{$dst} = $src; push @link_list, $dst; } my %systemd_triggers = (); sub finish_links { my ($timestamp) = @_; return unless @link_list; $timestamp = mars_time() unless $timestamp; lprint "using lamport timestamp $timestamp\n" if $verbose; my $trigger_code = 1; my $count = 0; while (my $link = shift @link_list) { my $link_tmp = to_tmp($link); my $target = readlink($link_tmp); next unless defined($target); my $this_timestamp = $timestamp; unless (system("touch -h -d \"\@$this_timestamp\" $link_tmp") == 0) { lwarn "cannot set mtime on symlink '$link_tmp'\n"; } if ($dry_run) { lprint "DRY_RUN: would create symlink '$link' -> '$target'\n"; unlink($link_tmp); next; } # allow replacement of directories with symlinks rmdir($link) if -d $link; unless (rename($link_tmp, $link)) { lwarn "cannot finalize symlink '$link'\n"; } elsif ($verbose) { lprint "created symlink '$link' -> '$target'\n"; # When necessary, trigger the local systemd path watchers if ($link =~ m:/(primary,systemd):) { my $trigger = "$mars/userspace/systemd-trigger"; if ($link =~ m:^($mars/resource-[^/]+)/:) { $trigger = "$1/systemd-trigger"; } $systemd_triggers{$trigger}++; } } $count++; $trigger_code = 2 if $link =~ m:/(primary|todo-global|ip):; } _trigger($trigger_code) if $count > 0; foreach my $trigger (sort alphanum_cmp (keys(%systemd_triggers))) { lprint "Triggering '$trigger'\n" if $verbose; safe_touch($trigger) || safe_creat($trigger); } %systemd_triggers = (); } ################################################################## # global checks sub get_global_versions { unless (%FLAGS_FEATURES) { foreach my $txt (keys(%FEATURES_FLAGS)) { my $mask = $FEATURES_FLAGS{$txt}; $FLAGS_FEATURES{$mask} = $txt; $chksum_features |= $mask if $txt =~ m/^CHKSUM/; $compress_features |= $mask if $txt =~ m/^COMPRESS/; } } unless (defined($ARGV[0]) && $ARGV[0] =~ m/cluster|cat/) { my $act_dir = "$mars/actual-$host"; mkdir($act_dir) unless -d $act_dir; $kernel_version = get_alive_link("tree", $host, 1); if ($kernel_version && $user_version != $kernel_version) { lwarn "kernel_version=$kernel_version user_version=$user_version\n"; if ($user_version < $kernel_version) { ldie "Sorry, your MARS kernel module uses version $kernel_version, but my $0 userspace version is only $user_version. That cannot work. Please upgrade your userspace scripts!\n"; } elsif (int($user_version) > int($kernel_version) + 1) { ldie "MAJOR VERSION mismatch : only one major upgrade step is supported.\n"; } lwarn "using different minor versions is possible, but you should upgrade your kernel module ASAP\n"; } } # compute the mimimum of kernel features capabilities foreach my $peer (get_any_peers()) { next if $peer =~ $match_reserved_id; my $features = get_alive_link("features", $peer, 1); next unless (defined($features) && $features); next unless $features =~ m/^([0-9]+),?([0-9]*),?([x0-9a-f]*)/; $features = $1; my $strategy = $2; my $flags = $3; next unless $features > 0; if ($kernel_features_version < 0 || $features < $kernel_features_version) { $kernel_features_version = $features; } if (defined($strategy) && $strategy) { if ($strategy < $kernel_strategy_version || $kernel_strategy_version < 0) { $kernel_strategy_version = $strategy; } } else { $kernel_strategy_version = 0; } if (defined($flags) && $flags ne "") { $flags = hex($flags) if $flags =~ m/^0x/;; $kernel_flags_version &= $flags; } else { $kernel_flags_version = 0x0; } } # can ssh be switched off? if ($kernel_strategy_version >= 5) { $ssh_port = 0; } # determine cluster-wide $compat_* values if ($kernel_strategy_version >= 3) { $compat_alivelinks = 0; } my $compat_path = "$mars/compat-deletions"; $compat_deletions = get_link($compat_path, 2); if ($kernel_features_version < 3) { $compat_deletions = 1; } } sub get_alive_links { my $res = shift; my $alive = shift || "alive"; my $hosts = shift || "*"; my $warn = shift || 0; my $non_participating_peers = shift || 0; my $non_participating_resources = shift || 0; $res = "*" if (!$res || $res eq "all" || $res =~ m/,/); my $use_remote_stamp = $alive =~ s/^\^// ? 1 : 0; my @peer_list; if ($hosts eq $real_host) { # needed at join-cluster when nothing is known about others @peer_list = ($real_host); } elsif ($non_participating_peers) { @peer_list = get_total_peers(); } else { @peer_list = get_any_peers($res ne "*" ? $res : undef); } my %peers; foreach my $peer (@peer_list) { next if ($peer eq $host && $hosts ne $host); next if $peer =~ $match_reserved_id; # After join-cluster & co, links may take a while to appear $peers{$peer} = 1 if $non_participating_peers; # peer must be a candiate matching the hosts spec if ($hosts && $hosts ne "*") { next unless $peer =~ m/(^|[+,{}])$hosts($|[+,{}])/; } # OK: remember peer $peers{$peer} = 1; } $peers{$host} = 1 if $hosts eq $host; return %peers unless %peers; my %links; foreach my $peer (keys(%peers)) { my $val = get_alive_link($alive, $peer, 2); # When required and possible, get the _remote_ timestamp # when the link tree was read _remotely_. if ($use_remote_stamp) { my $remote_path = "$mars/actual-$peer/read-stamp"; my $remote_val = get_link($remote_path, 2); # check compatibility to old versions, and downgrades if ($compat_alivelinks && $remote_val) { my $path = "$mars/$alive-$peer"; my $stamp1 = get_link_stamp($path); my $stamp2 = get_link_stamp($remote_path); # Try to prefer the new remote stamp. # Only use the old one when too much outdated. if ($stamp2 && (!$stamp1 || $stamp2 + $window >= $stamp1)) { $val = $remote_val; } } } $links{$peer} = $val; } if ($warn) { my $any_projection = $any_peers{$host}; my $member_projection = $member_peers{$host}; my $guest_projection = $guest_peers{$host}; my %any_resources = defined($any_projection) ? %$any_projection : (); my %member_resources = defined($member_projection) ? %$member_projection : (); my %guest_resources = defined($guest_projection) ? %$guest_projection : (); my $now = mars_time(); my $extra_count = 0; foreach my $peer (keys(%peers)) { my $stamp = get_alive_link("time", $peer, 2); next if (!$stamp || $stamp !~ m/^\s*[0-9.]/ || is_recent($stamp)); my $ip_path = "$mars/ips/ip-$peer"; my $got_ip_link = get_link($ip_path, 1); my $common = 0; my @peer_resources = get_any_resources($peer); foreach my $tmp_res (@peer_resources) { # check for foreign resources if ($member_resources{$tmp_res} || $guest_resources{$tmp_res}) { $common++; last; } } if (!$got_ip_link) { my $link_state = "does not exist"; if (get_raw_link($ip_path)) { $link_state = "exists but marked as .deleted"; } if ($common) { lwarn "Common peer '$peer' $link_state in $mars/ips/ - check your cluster configuration\n"; } elsif ($verbose || $non_participating_peers || $non_participating_resources) { lwarn "Foreign peer '$peer' $link_state in $mars/ips/ - check your cluster configuration\n"; } } my $age = seconds2human($now - $stamp); if (!$common) { # Non-member peers are updated much less frequently, # thus we need a much larger time window. if (!is_recent($stamp, 3600)) { $extra_count++; if ($non_participating_resources || $verbose) { lwarn "Foreign / nonmember peer '$peer' not reachable for $age\n"; } } next; } my $msg = "no metadata is arriving from peer '$peer', age = $age"; $msg .= " => check your network setup" if is_module_loaded(); lwarn "$msg\n"; } if ($extra_count && ($verbose || $non_participating_peers || $non_participating_resources)) { lwarn "$extra_count foreign / non-member peers are NOT REACHABLE for >= 1 hour\n"; } } return %links; } ################################################################## # timeout handling # # return the lamport clock time in nanosecond resolution # fallback to system time() # sub mars_time { my $lamport_time; if (open(my $lamport_clock, "<", "/proc/sys/mars/lamport_clock")) { while (<$lamport_clock>) { $lamport_time = $1 if /^lamport_now=(.*)/; } close($lamport_clock); } return $lamport_time || time() . "." . '0' x 9; } my $timeout_val = undef; sub sleep_timeout { my $sleeptime = shift || 1; my $continue = shift; if ($timeout < 0) { sleep($sleeptime); return 0; } if (!defined($timeout_val)) { $timeout_val = $timeout; lprint "Resetting timeout to $timeout\n" if $verbose > 0; } if ($timeout_val <= 0) { $timeout_val = undef; if (!defined($continue) || !$continue) { ldie "Timeout reached.\n"; } lwarn "Timeout reached. Continuing anyway.\n"; return 1; } my $rest = $timeout_val; $rest = $sleeptime if $rest > $sleeptime; sleep($rest); $timeout_val -= $rest; return 0; } # wait for some condition sub wait_cond { my ($cmd, $res, $specific) = @_; my $is_actual = ($specific =~ s/^(is|has)-//); $specific =~ s/^todo-//; my $is_on = !($specific =~ s/-(off|0)$//); $specific =~ s/-(on|1)$//; if ($is_actual) { if ($specific eq "device") { check_mars_device($cmd, $res, 1, !$is_on); return; } my %table = ( "attach" => "is-attached", "attached" => "is-attached", "replay" => "is-replaying", "replaying"=> "is-replaying", "fetch" => "is-copying", "fetching" => "is-copying", "copy" => "is-copying", "copying" => "is-copying", "sync" => "is-syncing", "syncing" => "is-syncing", "primary" => "is-primary", "secondary"=> "is-primary", ); my $name = $table{$specific}; ldie "actual indicator '$specific' does not exist\n" unless exists($table{$specific}); $is_on = !$is_on if $name eq "secondary"; check_status($cmd, $res, $name, $is_on ? 1 : 0, 1, 1); } else { my %table = ( "attach" => "attach", "attached" => "attach", "fetch" => "fetch", "connect" => "fetch", "replay" => "replay", "sync" => "sync", "primary" => "primary", "secondary" => "secondary", ); my $name = $table{$specific}; ldie "button '$specific' does not exist\n" unless exists($table{$specific}); check_todo($cmd, $res, $name, $is_on ? 1 : 0, 1); } } # wait until everything is recent sub wait_cluster { my ($cmd, $res, $hosts, $abort, $trigger_code, $condition) = @_; $res = "all" if (!$res || $res =~ m/,/); $hosts = "*" unless $hosts; $abort = $force unless defined($abort); $trigger_code = ($hosts =~ m/\*/ ? 3 : 2) unless $trigger_code; my $non_participating = ($trigger_code >= 8); $timeout_val = undef; get_global_versions(); finish_links(); if ($hosts eq $real_host && $kernel_strategy_version >= 5) { return 0; } my $max_restarts = 3; my $max_resets = 1; restart: lprint "Ping $hosts trigger=$trigger_code\n"; _trigger($trigger_code); my %old_status = get_alive_links($res, "time", $hosts, 0, $non_participating); my $total_count = scalar(keys(%old_status)); my $start_time = mars_time(); my %progress; foreach my $peer (keys(%old_status)) { $progress{$peer} = $non_participating ? 1 : 0; } my $res_txt = ""; if ($non_participating) { $res_txt = "any known or unknown"; } elsif ($res eq "all") { $res_txt = "locally mentioned"; } else { $res_txt = "resource '$res'"; } my $count = scalar(keys(%old_status)); if ($count && $count < 10) { lprint "Wait for answers from $res_txt peer(s) " . join(",", sort alphanum_cmp keys(%old_status)) . "\n"; } else { lprint "Wait for answers from $count peer(s) ($res_txt)\n"; } my $delta = $window > 0 ? $window / 2 : 30; while (1) { # Early abort when condition is met if ($condition) { my $status = eval($condition); lprint "Condition '$condition' status=$status\n" if $verbose; return $status if $status; } my $dead_count = 0; my $alive_count = 0; my $unknown_count = 0; # Notice: prefer the remote timestamps for race prevention my %status = get_alive_links($res, "^time", $hosts, 0, $non_participating);; my $now = mars_time(); foreach my $peer (keys(%status)) { next if ($peer eq $host && $hosts ne $host); if (!$status{$peer}) { # This can happen at join-cluster & co my $ip_link = "$mars/ips/ip-$peer"; my $ip_link_stamp = get_link_stamp($ip_link); if ($ip_link_stamp && $ip_link_stamp + $delta < $now && $ip_link_stamp > 1) { $dead_count++; } elsif ($start_time + $delta < $now) { $dead_count++; } else { $unknown_count++; } next; } elsif (!$old_status{$peer}) { # Oh, only the old info is outdated, but we have a new one. # Also possible at join-cluster: restart freshly lwarn "Need restart for getting more 'time' links\n"; goto restart if $max_restarts-- > 0; } if ($status{$peer} !~ m/^[0-9.]+$/) { $dead_count++; next; } if ($status{$peer} > $old_status{$peer} && $status{$peer} > $start_time && $progress{$peer}-- <= 0) { $alive_count++; } elsif ($status{$peer} + $delta < $now && $old_status{$peer} + $delta < $now) { $dead_count++; } else { $unknown_count++; } } if ($unknown_count) { _trigger($trigger_code); # ensure more progress will happen %old_status = %status if !$max_resets++; } else { if (!$dead_count) { lprint "$alive_count/$total_count peer(s) seem to be alive\n"; } else { lwarn "$alive_count/$total_count peer(s) seem to be alive, and $dead_count peer(s) seem to be dead / not reachable\n"; ldie "aborting (--force was not given)\n" if $abort; } last; } sleep_timeout(1, 1); last if $timeout_val <= 0 && !$unknown_count; } return 0; } sub wait_cluster_noforce { my ($cmd, $res) = @_; if (!$force) { lprint "WAITING for communication\n" if $verbose; wait_cluster($cmd, $res, "*", 0); } } sub update_cluster { my ($cmd, $res, $hosts, $condition) = @_; $res = "all" if (!$res || $res =~ m/,/); $hosts = "*" unless $hosts; lprint "UPDATING $res\n" if $verbose; my $status = wait_cluster($cmd, $res, $hosts, 0, 8, $condition); _reset_resources(); return $status; } sub is_cluster_recent { my ($cmd, $res, $hosts) = @_; my $dead_count = 0; my $alive_count = 0; my $unknown_count = 0; my $now = mars_time(); my %status = get_alive_links($res, "time", $hosts); foreach my $peer (keys(%status)) { next if $peer eq $host; if ($status{$peer} + $window/2 >= $now) { $alive_count++; } elsif ($status{$peer} + $window < $now) { $dead_count++; } else { $unknown_count++; } } return ($dead_count, $alive_count, $unknown_count); } sub recent_cluster { my $cmd = shift; my $res = shift; $res = "all" if (!$res || $res =~ m/,/); my $hosts = shift || "*"; my ($dead_count, $alive_count, $unknown_count) = is_cluster_recent($cmd, $res, $hosts); return 1 if (!$dead_count && !$unknown_count); wait_cluster($cmd, $res, $hosts, 0); return 0; } # Newer kernel modules do no longer work on non-member resources # by defaults, for scalability reasons. # Activate such a resource as a guest. sub _activate_resource { my ($cmd, $res, $peer) = @_; $peer = $host unless $peer; return if $peer =~ $match_reserved_id; finish_links(); if (!$res || $res eq "*" || $res eq "all") { lwarn "no usable resource to activate ($res).\n"; return; } my $need_trigger = 0; my $resdir = "$mars/resource-$res"; my $max_retry = 3; until (-d $resdir) { my $condition = "return -d \"$resdir\";"; update_cluster($cmd, $res, "*", $condition); last if -d $resdir; if ($max_retry-- < 0) { ldie "Resource directory '$resdir' does not exist\n"; } lwarn "Resource directory '$resdir' does not yet exist, fetching...\n"; $need_trigger++; sleep(3); } lprint "OK, resource directory '$resdir' exists.\n"; my $check_link = "$resdir/size"; $max_retry = 3; until (get_link($check_link, 2)) { my $condition = "return get_link(\"$check_link\", 2);"; update_cluster($cmd, $res, "*", $condition); if ($max_retry-- < 0) { ldie "link '$check_link' does not exist.\n"; } lprint "link '$check_link' does not yet exist, fetching...\n"; $need_trigger++; sleep(3); } lprint "OK, link '$check_link' exists.\n"; my $active_path = "$resdir/device-$peer"; my $check = get_link($active_path, 2); if ($check) { if ($peer ne $real_host) { _push_check($peer, "", $resdir); _push_check($peer, "", $active_path); } # check end-to-end my $lnk = "$mars/resource-$res/actual-$peer/if-on"; my $val = get_link($lnk); if ($val ne "") { my $stamp = get_link_stamp($lnk, 2); my $end_to_end = is_recent($stamp); if ($end_to_end) { lprint "Resource '$res' was already activated at '$peer'\n"; return; } lwarn "Link '$lnk' has value '$val' as seen by '$real_host'\n"; } lwarn "Resource '$res' at '$peer' appears as non-activated from the viewpoint of '$real_host'\n"; } lprint "ENABLING resource '$res' at '$peer'\n"; # Ensure guest communication: a faked device-$peer must exist # for activation. # Notice: this may be later overwritten with another value. set_link($res, $active_path); # prevent fallback to old device detection my $actual_dir = "$resdir/actual-$peer"; mkdir($actual_dir); my $if_path = "$actual_dir/if-on"; set_link("0", $if_path); finish_links(); _push_check($peer, "", $active_path) if $peer ne $real_host; # wait for (self-)activation wait_cluster($cmd, $res, $peer, 0); # activated peers might have changed my $primary = _get_designated_primary($res, 1); for (my $retry = 3; $retry >= 0; $retry--) { _trigger(3); my $condition = "return _get_designated_primary(\"$res\", 1);"; my $from = ($primary && $primary ne "(none)") ? $primary : "*"; my $old_primary = $primary; $primary = update_cluster($cmd, $res, $from, $condition); last if ($primary && $primary eq $old_primary); sleep(3); } if (!$need_trigger && systemd_present($cmd, $res)) { lprint "using global trigger\n"; $need_trigger++; } systemd_any_trigger($cmd) if $need_trigger; } sub activate_guest { my ($cmd, $res, $peer) = @_; $peer = $host unless $peer; my $resdir = "$mars/resource-$res"; if ($cmd =~ m/deactivate/) { ldie "resource directory '$resdir' does not exist.\n" unless -d $resdir; my $check_glob = "$resdir/*{data,replay}-$peer"; if (lamport_glob($check_glob)) { ldie "peer '$peer' is a storage member of resource '$res'\n"; } my $guest_path = "$resdir/device-$peer"; _create_delete($guest_path); return; } _activate_resource($cmd, $res, $peer); ldie "Could not get resource name '$res'\n" unless -d $resdir; } ################################################################## # syntactic checks # (also check for existence) sub check_id { my ($str, $must_exist, $abort_reserved, $allow_specs) = @_; ldie "identifier '$str' is empty" unless defined($str) && $str; ldie "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z0-9_][-A-Za-z0-9_]*$/; ldie "identifier '$str' is too long (only 63 chars supported according to RFC 1123)" if length($str) > 63; if ($str =~ $match_reserved_id) { if ($allow_specs && $str eq "all") { lhint "using specifier '$str'\n" if $verbose > 2; } elsif (!$abort_reserved) { lwarn "identifier '$str' is a RESERVED special name.\n"; } elsif ($abort_reserved < 0) { lhint "using reserved name '$str'\n" if $verbose > 2; } else { lwarn "identifier '$str' is a RESERVED special name: forbidden here.\n"; if (!$force) { ldie "reserved names are dangerous.\n"; } else { lwarn "due to --force, I continue with reserved name '$str' AT YOUR RISK.\n"; } } } if (defined($must_exist) && $must_exist) { my $ip_path = "$mars/ips/ip-$str"; unless (get_link($ip_path, 1)) { if (!$force) { lwarn "Host '$str' does not exist in $mars/ips/\n"; ldie "When you know what you are doing, you may fix any typos and then use --force\n"; } lwarn "host '$str' does not exist in $mars/ips/ - hopefully you know what you are doing via --force\n"; } } } sub check_id_list { my ($str, $must_exist, $abort_reserved, $allow_specs) = @_; ldie "comma-separated list '$str' has disallowed characters" unless $str =~ m/^[A-Za-z0-9_][-A-Za-z0-9_,]*$/; foreach my $id (split(",", $str)) { check_id($id, $must_exist, $abort_reserved, $allow_specs); } } ################################################################## # semantic checks sub check_res { my $res = shift; wait_cluster("wait-cluster", "all", "*", 0) unless -d "$mars/resource-$res"; if (not -d "$mars/resource-$res") { # Ensure that all cluster information is present # DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique my $count = 0; my $found; my @tests = lamport_glob("$mars/resource-*/device-$host"); foreach my $test (@tests) { my $target = get_link($test, 2); if ($target eq $res) { $found = $test; $count++; } } if (!$count) { @tests = lamport_glob("$mars/resource-*/_direct-*-$host"); foreach my $test (@tests) { my $target = get_link($test, 2); $target =~ s/^.*,//; if ($target eq $res) { $found = $test; $count++; } } } ldie "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found; $found =~ s:^.*/resource-(.*)/.*$:$1:; lwarn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n"; $res = $found; } return $res; } sub _get_mars_size { my ($cmd, $res) = @_; my $dev_name = get_link("$mars/resource-$res/device-$host"); my $info = "/sys/devices/virtual/block/mars!$dev_name/size"; return `cat $info` * 512; } sub check_sizes { my ($res, $peer) = @_; my $physical_size = get_link("$mars/resource-$res/actsize-$peer", 2) || return; my $logical_size = get_link("$mars/resource-$res/size", 1); if (defined($physical_size) && $physical_size < $logical_size) { lwarn "Physical device on host '$peer' has size $physical_size, which is smaller than the logical resource size $logical_size\n"; ldie "This is too dangerous. It cannot work. Fix it!\n" unless $force; } } sub check_res_member { my ($cmd, $res) = @_; if (! link_exists("$mars/resource-$res/data-$host")) { if (link_exists("$mars/resource-$res/device-$host")) { # guest return; } if (link_exists("$mars/resource-$res/replay-$host")) { lwarn "Resource '$res' seems to have been destroyed.\n"; lwarn "Nevertheless, a replay link exists for host '$host'.\n"; lwarn "This can happen after 'leave-resource --host=$host' while host $host was active.\n"; lwarn "mars-user-manual.pdf forbids any usage of such a resource _strongly_.\n"; lwarn "In order to finally remove this resource from $host, use the\n"; lwarn "command 'marsadm leave-resource --force $res'\n"; } else { lwarn "Sorry, I have not yet joined to resource '$res'\n"; } ldie "Refusing work on resource name '$res'\n" unless $force; lwarn "Running '$cmd' is dangerous, continuing on your own risk\n" unless $cmd eq "leave-resource"; } check_sizes($res, $host); } sub _sync_finished { my ($res, $peer) = @_; my $lnk = "$mars/resource-$res/syncstatus-$peer"; return 0 unless link_exists($lnk); my $syncstatus = get_link($lnk, 1); my $size = get_link("$mars/resource-$res/size"); return 0 if ($size <= 0); return 0 if ($syncstatus < $size); my $syncpos = get_link("$mars/resource-$res/syncpos-$peer", 2); if (defined($syncpos) && $syncpos) { my $replay = get_link("$mars/resource-$res/replay-$peer", 1); return 0 unless $replay; my $cmp = compare_replaylinks($syncpos, $replay); return 0 if $cmp > 0; } return 1; } sub check_sync_finished { my ($res, $peer, $do_force) = @_; check_sizes(@_); my $lnk = "$mars/resource-$res/actual-$peer/is-primary"; my $is_recent = is_link_recent($lnk, $window); my $is_primary = get_link($lnk, 1); if (defined($is_primary) && $is_primary) { # By definition, a primary does not need to sync if ($is_recent) { return; } } if (!$is_recent) { lwarn "Cannot _reliably_ determine the current role of '$peer'\n"; lwarn "Apparently, '$peer' _seems_ to be in primary role.\n" if $is_primary; } if (!_sync_finished($res, $peer)) { lwarn "Sync has not yet finished on host '$peer'\n"; if ($peer eq $host) { lwarn "Don't try to make inconsistent host '$host' the new primary!\n"; lwarn "Please wait until sync has finished and all logfile have been replayed.\n"; ldie "Refusing to switch inconsistent host '$host' to primary\n"; } else { lwarn "Changing the primary role during sync is dangerous for data consistency on host '$peer'!\n"; } unless ($do_force) { lprint "HINT: consider the option --ignore-sync if you are sure that you want to restart the sync\n"; ldie "First wait for sync finished before primary handover, or use --ignore-sync\n"; } } lprint "OK, it seems that sync has finished on host '$peer'.\n"; } sub check_primary { my ($cmd, $res, $no_designated, $no_fail, $no_warn) = @_; my $fail = 0; my $lnk = "$mars/resource-$res/actual-$host/is-primary"; my $is_primary = get_link($lnk, 1); if (!$is_primary) { # give it a second chance $is_primary = device_exists($res); } unless ($is_primary) { lwarn "For operation '$cmd' I need to be primary\n" unless $no_warn; $fail++; } if ($no_designated) { my $primary = _get_designated_primary($res); unless ($primary eq $host) { lwarn "For operation '$cmd', I also must be the designated primary\n" unless $no_warn; } } unless (defined($no_fail) && $no_fail) { ldie "Operation '$cmd' only works at the primary side\n" if $fail; } return $fail; } sub check_not_primary { my ($cmd, $res, $allow_force) = @_; my $lnk = "$mars/resource-$res/actual-$host/is-primary"; my $is_primary = get_link($lnk, 1); if ($is_primary) { if (!$force || !defined($allow_force) || !$allow_force) { ldie "operation '$cmd' cannot be executed on actual primary\n"; } lwarn "operation '$cmd' is forced on actual primary '$host', THIS IS RISKY\n"; } # also check whether we intend to become primary my $primary = _get_designated_primary($res, 1); if ($primary eq $host) { lwarn "operation '$cmd' cannot be executed on designated primary\n"; ldie "first switch the designated primary, or use --force if you are sure.\n" unless $force; lwarn "continuing anyway due to --force\n"; } } sub check_primary_gone { my ($cmd, $res, $peer) = @_; return 0 unless (defined($peer) && $peer && $peer ne "(none)"); return 0 if $peer eq $host; my $lnk = "$mars/resource-$res/actual-$peer/is-primary"; my $is_primary = get_link($lnk, 1); if (defined($is_primary) && $is_primary eq "0") { lprint "OK, other host ($peer) is not actual primary.\n"; return 0; } lprint "waiting for other primary host ($peer) to disappear....\n"; return 1; } sub _make_messages { my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_; my %table = ( "is-attached" => "attach", "is-syncing" => "sync", "is-copying" => "fetch", "is-replaying" => "replay", "is-primary" => "primary", ); my $key_msg = defined($table{$key}) ? $table{$key} : $key; my $val_msg = $val ? "on" : "off"; my $wait_msg = $wait ? "waiting $timeout until" : "ensuring that"; my $new_val = (defined($inv) && $inv) ? !$val : $val; my $action_msg = ($new_val ? "'resume-" : "'pause-") . $key_msg . "'"; if ($key_msg eq "primary") { $action_msg = $new_val ? "'$key_msg'" : "'secondary' (or better 'primary' on another host)"; } elsif ($key_msg eq "attach") { $action_msg = $new_val ? "'$key_msg'" : "'detach'"; } $action_msg = "Use $action_msg and/or wait until it has really succeeded."; $action_msg .= " Notice that MARS is an ASYNCHRONOUS system, where both execution of actions, as well as propagation over the network may take some time." if $verbose; $action_msg .= " Don't expect magic to happen if the network has some problems, or when the system load is extremely high." if $verbose > 1; $action_msg .= " Please read the PDF manual, and try to understand it." if $verbose > 2; return ($key_msg, $val_msg, $wait_msg, $action_msg); } sub check_todo { my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_; $key =~ s/fetch/connect/; my $path = "$mars/resource-$res/todo-$host/$key"; $path = correct_path($path); my $link; for (;;) { $link = get_link($path, $unchecked); return unless defined($link); $link = ($link eq $host) ? 1 : 0 if $key eq "primary"; $link = ($link eq "(none)") ? 1 : 0 if $key eq "secondary"; my ($key_msg, $val_msg, $wait_msg, $action_msg) = _make_messages(@_); if (defined($inv) && $inv) { last if $link != $val; lprint "$wait_msg switch '$key' != '$val'....\n"; ldie "Cannot execute $cmd on resource $res: todo-switch '$key_msg' must not be $val_msg. $action_msg\n" if !$wait; } else { last if $link == $val; lprint "$wait_msg switch '$key' == '$val'....\n"; ldie "Cannot execute $cmd on resource $res: todo-switch '$key_msg' must be $val_msg, but actually has value '$link'. $action_msg\n" if !$wait; } sleep_timeout(); } lprint "OK, '$path' has acceptable value '$link'\n"; } sub _run_compensation { my ($compensation) = @_; if (defined($compensation)) { my $txt = $compensation; $txt =~ s/\n/\\n/mg; lprint "Running compensation action '$txt'\n"; eval $compensation; } } sub check_status { my ($cmd, $res, $key, $val, $wait, $unchecked, $inv, $peer, $action, $compensation) = @_; $peer = $host unless defined($peer); my $path = correct_path("$mars/resource-$res/actual-$peer/$key"); my $link; my $rounds = 0; my $fail_round = 10; for (;;) { $link = get_link($path, $unchecked); $link = 0 unless (defined($link) && $link ne ""); my ($key_msg, $val_msg, $wait_msg, $action_msg) = _make_messages(@_); if (defined($inv) && $inv) { last if $link != $val; lprint "at $peer: $wait_msg actual '$key' != '$val'...\n"; ldie "Cannot execute $cmd on resource $res: actual '$key_msg' must not be $val_msg. $action_msg Also ensure that your command _can_ succeed.\n" if !$wait; } else { last if $link == $val; lprint "at $peer: $wait_msg actual '$key' == '$val'...\n"; ldie "Cannot execute $cmd on resource $res: actual '$key_msg' must be $val_msg. $action_msg Also ensure that your command _can_ succeed.\n" if !$wait; } if (defined($action) && $action && $rounds > 1) { lprint "Action: $action\n" if $verbose > 2; my $action_status = 0; my $old_error_count = $error_count; eval "$action"; $error_count = $old_error_count; # Tolerate intermediate failures for some time if ($action_status && $rounds > $fail_round) { lwarn "failed action $action_status: $action\n"; _run_compensation($compensation); ldie "Action failure, status=$action_status\n"; } } my $status = sleep_timeout(undef, 1); if ($status) { _run_compensation($compensation); ldie "Timeout\n"; } $rounds++; } lprint "OK at $peer: '$path' has acceptable value '$link'\n"; } sub check_mars_device { my ($cmd, $res, $wait, $inv) = @_; my $dev = device_name($res); my $backoff = 1; my $round = 0; if ($inv) { while (device_exists($res)) { ldie "cannot execute $cmd: device '$dev' has not yet disappeared\n" if !$wait; lwarn "device '$dev' has not yet disappeared\n"; sleep_timeout($backoff); # very slowly increasing backoff if ($backoff < 10 && $round++ > 5) { $round = 0; $backoff++; } systemd_any_trigger($cmd, $res); } lprint "device '$dev' is no longer present\n" unless device_exists($res); return; } # !$inv my $primary = _get_designated_primary($res); ldie "for operation '$cmd', I should be the designated primary\n" unless $primary eq $host; while (!device_exists($res)) { my $text = get_error_text($cmd, $res); lprint $text if $text; ldie "aborting due to errors\n" if $text =~ m/error/mi; ldie "cannot execute $cmd: device '$dev' not yet present\n" if !$wait; lprint "device '$dev' not yet present\n"; sleep_timeout($backoff); # very slowly increasing backoff if ($backoff < 10 && $round++ > 5) { $round = 0; $backoff++; } } lprint "device '$dev' is present\n" if device_exists($res); } sub check_userspace { my ($dst) = @_; if ($dst !~ m:/userspace/:) { ldie "your path '$dst' must be inside $mars/userspace/ or $mars/resource-*/userspace/\n" unless $force; lwarn "your path '$dst' is outside $mars/userspace/ or $mars/resource-*/userspace/ and you gave --force, hopefully YOU KNOW WHAT YOU ARE DOING\n"; } } sub check_sync_startable { my ($cmd, $res) = @_; my $primary = _get_designated_primary($res); ldie "Cannot execute '$cmd' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)"); # no danger when switch is turned on at the primary side. return if $primary eq $host; my $emergency_path = "$mars/resource-$res/actual-$primary/has-emergency"; my $emergency = get_link($emergency_path, 1); if ($emergency) { ldie "Primary '$primary' is in emergency mode. Cannot start sync.\nFree some space there first.\n"; } } ################################################################## # state inspection routines sub _get_minmax { my ($res, $glob, $take_symlink) = @_; my $min = -1; my $max = -1; my @paths = lamport_glob($glob) or lwarn "cannot find '$glob'\n"; foreach my $path (@paths) { my $nr = $path; if ($take_symlink) { $nr = get_link($path, 1); } $nr =~ s@^(?:.*/)?[a-z]+-([0-9]+)(-[^/]*)?$@$1@; $min = $nr if ($nr < $min || $min < 0); $max = $nr if ($nr > $max || $max < 0); } return ($min, $max); } sub get_minmax_logfiles { my ($res, $peer) = @_; $peer = "" unless defined($peer); return _get_minmax($res, "$mars/resource-$res/log-*$peer", 0); } sub get_minmax_versions { my ($res, $peer) = @_; $peer = "" unless defined($peer); return _get_minmax($res, "$mars/resource-$res/version-*$peer", 0); } sub get_minmax_replays { my ($res, $peer) = @_; $peer = "" unless defined($peer); return _get_minmax($res, "$mars/resource-$res/replay-*$peer", 1); } ################################################################## # generic comparisons sub compare_replaylinks { my ($a, $b) = @_; $a =~ m/log-([0-9]+)[^,]*,([0-9]+)/; my ($a_log, $a_pos) = ($1, $2); $b =~ m/log-([0-9]+)[^,]*,([0-9]+)/; my ($b_log, $b_pos) = ($1, $2); return -1 if $a_log < $b_log; return +1 if $a_log > $b_log; return -1 if $a_pos < $b_pos; return +1 if $a_pos > $b_pos; return 0; } ################################################################## sub get_amount { my ($resdir, $host, $direction, $only_files) = @_; my $level = 0; my $firstpos = 0; my $logpos = 0; my $oldpos = 0; my $sum = 0; for (;;) { my $val0 = 0; my $ok = 0; if (!$level) { my $replay_path = sprintf("%s/replay-%s", $resdir, $host); my $replay_link = get_link($replay_path, 1); return (0, 0, 0, 0) if !$replay_link; return (0, 0, 0, 0) if $replay_link !~ m:log-([0-9]+)-[^,]+,([0-9]+),([0-9]+):; $firstpos = int($1); $logpos = $firstpos; $oldpos = $firstpos; if ($direction < 0) { $sum = $2; } else { $val0 = $2; $sum = $3; } } if ($level > 0 || $direction > 0) { my $file = sprintf("%s/log-%09d-%s", $resdir, $logpos, $host); my @stat = stat($file); my $val = 0; if (@stat && file_exists($file)) { $val = $stat[7]; $ok = 1; } else { my $glob = sprintf("%s/log-%09d-*", $resdir, $logpos); foreach $file (lamport_glob($glob)) { my @tstat = stat($file); if (@tstat && $tstat[7] > $val) { @stat = @tstat; $val = $stat[7]; $ok = 1; } } } if (!$only_files) { my $glob = sprintf("%s/version-%09d-*", $resdir, $logpos); foreach $file (lamport_glob($glob)) { my $vers_link = get_link($file, 1); if ($vers_link && $vers_link =~ m;,([0-9]+):;) { my $nval = $1; $val = $nval if $nval > $val; $ok = 1; } } } return ($sum, $firstpos, $oldpos, $level) unless $ok; if ($level > 0) { $sum += $val; } else { # the logfile may be bigger than the replay pos/length my $new_sum = $val - $val0; $sum = $new_sum if $new_sum > $sum; $sum = 0 if $sum < 0; } } $oldpos = $logpos; if ($direction > 0) { $logpos++; } elsif ($direction < 0) { $logpos--; return ($sum, $firstpos, $oldpos, $level) if $logpos < 1; } else { return ($sum, $firstpos, $oldpos, $level); } $level++; } } ################################################################## # versionlink path handling routines my %visited_pos; sub _visit { my ($nr, $peer) = @_; $nr =~ s:^0*::; my $visit = "$nr,$peer"; $visited_pos{$visit} = 1; } sub _is_visited { my ($nr, $peer) = @_; $nr =~ s:^0*::; my $visit = "$nr,$peer"; return $visited_pos{$visit}; } sub _parse_pos { my ($basedir, $pos) = @_; if ($pos =~ m/log-([0-9]+)-([^,]+)/) { _visit($1, $2); } elsif ($pos =~ m/version-([0-9]+)-([^,]+)/p) { my $vers = get_link("$basedir/$MATCH"); my $count_matches = 0; while ($vers =~ m/log-([0-9]+)-([^,]+)/p) { $vers = $POSTMATCH; _visit($1, $2); $count_matches++; } lwarn "cannot parse '$pos' -> '$vers'\n" unless $count_matches; } else { lwarn "cannot parse '$pos'\n"; } $pos =~ m/((?:log|version)-([0-9]+)-([^,]+)(?:,([0-9]+))?)/ or lwarn "cannot parse position info '$pos'\n"; return ($1, int($2), $3, defined($4) ? int($4) : -1); } sub _get_prev_pos { my ($basedir, $nr, $peer) = @_; my $path = sprintf("version-%09d-$peer", $nr); my $vers = get_link("$basedir/$path", 2); if (!defined($vers) || !$vers) { # Scarce race. # This can happen when new logfiles are present but not yet worked on. # Also improves robustness on damaged filesystems. # Decrement nr by hand, try, take the first part. $path = sprintf("version-%09d-$peer", $nr - 1); $vers = get_link("$basedir/$path", 2); $vers =~ s/:.*// if defined($vers) && $vers; } else { # take the last part, pointing to the predecessor versionlink. $vers =~ s/^.*://; } _parse_pos($basedir, $path) if defined($vers) && $vers; return $vers; } sub _get_common_ancestor { my ($total1, $total2) = (0, 0); for (;;) { my ($basedir, $pos1, $host1, $dep1, $pos2, $host2, $dep2) = @_; my ($p1, $nr1, $from1, $len1) = _parse_pos($basedir, $pos1); my ($p2, $nr2, $from2, $len2) = _parse_pos($basedir, $pos2); if ($p1 eq $p2) { # usually no split brain here (only if both path depths are non-zero) my $split = ($dep1 && $dep2); if (!$split) { # additionally check the corresponding version links my $path1 = sprintf("$basedir/version-%09d-$from1", $nr1); my $path2 = sprintf("$basedir/version-%09d-$from2", $nr2); if (my $vers1 = get_link($path1, 1) and my $vers2 = get_link($path2, 1)) { if ($vers1 ne $vers2) { $split = 1; $total1 += $len1; $total2 += $len2; } } } return ($p1, $split, $total1, $total2); } elsif ($nr1 > $nr2) { # just flip arguments @_ = ($basedir, $pos2, $host2, $dep2, $pos1, $host1, $dep1); ($total1, $total2) = ($total2, $total1); next; } elsif ($nr1 < $nr2) { # recursively advance path depth $total2 += $len2; my $vers2 = _get_prev_pos($basedir, $nr2, $host2); return ("", -1, $total1, $total2) if !$vers2; @_ = ($basedir, $pos1, $host1, $dep1, $vers2, $host2, $dep2 + 1); next; } elsif ($from1 ne $from2) { # split brain is sure now, but continue computing the common split point my $vers1 = _get_prev_pos($basedir, $nr1, $host1); return ("", 1, $total1 + $len1, $total2 + $len2) if !$vers1; my $vers2 = _get_prev_pos($basedir, $nr2, $host2); return ("", 1, $total1 + $len1, $total2 + $len2) if !$vers2; my ($res, $split, $t1, $t2) = _get_common_ancestor($basedir, $vers1, $host1, $dep1 + 1, $vers2, $host2, $dep2 + 1); return ($res, 1, $t1 + $total1 + $len1, $t2 + $total2 + $len2); } elsif ($len1 < $len2) { # there may be no split brain (just incomplete replay) depending on path depth return ($p1, $dep1, $total1 + $len1, $total2 + $len2); } elsif ($len2 < $len1) { # dto symmetric return ($p2, $dep2, $total1 + $len1, $total2 + $len2); } lwarn "error in algorithm: $p1, $nr1, $from1, $len1 : $p2, $nr2, $from2, $len2\n"; return ("", -1, 0, 0); } } sub get_common_ancestor { my ($basedir, $host1, $host2) = @_; my $repl1 = get_link("$basedir/replay-$host1", 1); my $repl2 = get_link("$basedir/replay-$host2", 1); return _get_common_ancestor($basedir, $repl1, $host1, 0, $repl2, $host2, 0); } my %detected_splits = (); sub detect_splitbrain { my ($res, $do_report) = @_; # dynamic programming return $detected_splits{$res} if defined($detected_splits{$res}); my $basedir = "$mars/resource-$res"; my $ok = 1; my %alive_host_stamps; my %involved_logfiles; my %involved_hosts; my $is_likely_false_positive = 0; my $i_am_primary = 0; my $is_likely_outdated = 0; my @list = lamport_glob("$mars/resource-$res/replay-*"); my @hosts = map { $_ =~ s:.*/replay-::; $_ } @list; foreach my $host1 (@hosts) { $alive_host_stamps{$host1} = get_alive_link("time", $host1, 1); if (!is_recent($alive_host_stamps{$host1}, $window)) { $is_likely_outdated++; } foreach my $host2 (@hosts) { # reduce O(k^2) by some factor next if $host1 ge $host2; my $re_checked = 0; my ($point, $split, $size1, $size2); for (my $retry = 2; $retry > 0; $retry--) { ($point, $split, $size1, $size2) = get_common_ancestor($basedir, $host1, $host2); last if !$split; # Workaround races. sleep(0); } re_check: if ($split) { $ok = 0; if ($do_report) { if ($point) { my $log = "$basedir/$point"; $log =~ s:,.+::; my $stamp = get_link_stamp($log); my $vers = $log; $vers =~ s:/log-:/version-:; my $stamp2 = get_link_stamp($vers); # take the minimum $stamp = $stamp2 if !$stamp || ($stamp2 && $stamp2 < $stamp); # for safety, report the eldest timestamp if (!$involved_logfiles{$point} || $stamp < $involved_logfiles{$point}) { $involved_logfiles{$point} = $stamp; } } elsif (!defined($involved_logfiles{$point})) { $involved_logfiles{$point} = 0; } # for safety, report the biggest size if (!defined($involved_hosts{$host1}) || $size1 > $involved_hosts{$host1}) { $involved_hosts{$host1} = $size1; } if (!defined($involved_hosts{$host2}) || $size2 > $involved_hosts{$host2}) { $involved_hosts{$host2} = $size2; } } if (!$re_checked++) { # Safeguard against unknown roles (primary vs secondary) # Also try the same pair in reverse roles, the result _might_ be different # due to some mathematical reasons ;) ($point, $split, $size2, $size1) = get_common_ancestor($basedir, $host2, $host1); goto re_check; } } elsif ($host1 eq $real_host || $host2 eq $real_host) { # try to filter out false positives foreach my $other ($host1, $host2) { my $lnk = "$mars/resource-$res/actual-$other/is-primary"; my $is_primary = get_link($lnk, 2); if ($is_primary && is_link_recent($lnk, $window)) { if ($other eq $real_host) { $i_am_primary++; } else { $is_likely_false_positive++; } } } } } } # report any relevant logfile split-points if (%involved_logfiles) { # check whether kernel module is loaded my $stamp = $alive_host_stamps{$real_host}; $stamp = 0 unless defined($stamp); if (!$stamp) { lhint " the following warnings may result from defective hardware / network.\n"; } elsif (!is_recent($stamp, $window)) { lhint " the following warnings are likely OUTDATED by ~" . seconds2human(mars_time() - $stamp) . "\n"; } elsif ($is_likely_outdated) { my $cnt; if ($is_likely_outdated == 1) { $cnt = "1 host is"; } else { $cnt = "$is_likely_outdated host infos are"; } lhint " $cnt likely OUTDATED and MIGHT be FALSE POSITIVE\n"; } foreach my $point (sort alphanum_cmp keys(%involved_logfiles)) { my $stamp = $involved_logfiles{$point}; my $age = ""; if ($stamp) { $age = " age ~" . seconds2human(mars_time() - $stamp); } lwarn "SPLIT BRAIN of resource '$res' after logfile '$point'$age\n"; } } # report involved peer status (as far as known) if (%involved_hosts) { my $nr = 0; foreach my $peer (sort alphanum_cmp keys(%involved_hosts)) { $nr++; my $size = $involved_hosts{$peer}; my $txt = ""; my $lnk = "$mars/resource-$res/actual-$peer/is-primary"; my $is_primary = get_link($lnk, 1); if ($is_primary) { if ($peer eq $real_host) { $txt = " primary"; } else { $txt = " reported-primary"; } } elsif (!defined($is_primary) || $is_primary eq "") { $txt = " unknown-role"; } my $peer_ip = get_link("$mars/ips/ip-$peer", 1); my $stamp = $alive_host_stamps{$peer}; if (!defined($peer_ip) || $peer_ip eq "") { $txt .= " unconfigured IP"; } elsif (!defined($stamp) || $stamp eq "") { $txt .= " unreachable"; } elsif (!is_recent($stamp, $window)) { $txt .= " interrupted ~" . seconds2human(mars_time() - $stamp); } if ($peer eq $real_host) { if ($i_am_primary) { lhint " host$nr = '$peer'$txt, my PRIMARY logfile is always UPTODATE by definition.\n"; next; } elsif ($is_likely_false_positive) { lhint " the following warning MAY be a FALSE POSITIVE (but I cannot determine the truth over network)\n"; } } if (defined($size) && $size > 0) { lwarn " host$nr = '$peer'$txt logfile_amount='$size' (" . number2human($size) . ")\n"; } else { lwarn " host$nr = '$peer'$txt (unknown logfile size and age)\n"; } } } if ($ok) { # check for duplicate logfiles my @logs = lamport_glob("$mars/resource-$res/log-*"); my $oldnr = -1; foreach my $path (sort alphanum_cmp @logs) { $path =~ m:/log-([0-9]+):; my $nr = $1; if ($nr == $oldnr) { $ok = 0; lwarn "SPLIT BRAIN at resource '$res' detected: duplicate logfile number $nr\n"; lhint "first resolve split brain by 'leave-resource' or 'invalidate'\n"; lhint "if this does not help, try cleanup via 'log-purge-all'\n"; lhint "if this does not help, try 'log-purge-all --force'\n"; last; } $oldnr = $nr; } } $detected_splits{$res} = $ok; return $ok; } sub _mark_path_backward { my ($basedir, $pos, $peer, $skip, $jump_peer) = @_; my $sum = 0; my $base_nr = 0; for (;;) { my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos); last if defined($skip) && $nr < $skip; $base_nr = $nr; _visit($nr, $peer); # When following chains from foreign hosts (e.g. the designated primary), # we must jump over to our own chain somewhen, because the lengths of # the chains may be different (caused by invalidate & friends). if (defined($jump_peer) && $jump_peer ne $peer) { my $peer_path = sprintf("$basedir/version-%09d-$peer", $nr); my $jump_path = sprintf("$basedir/version-%09d-$jump_peer", $nr); my $peer_version = get_link($peer_path, 2); my $jump_version = get_link($jump_path, 2); if (defined($peer_version) && defined($jump_version) && $peer_version eq $jump_version) { $peer = $jump_peer; } } $pos = _get_prev_pos($basedir, $nr, $peer, 1); last if !$pos; # optionally don't count the last versionlink, pointing into nirvana if (defined($skip) && $skip && $nr > 1) { my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos); last if !$p; my $next = _get_prev_pos($basedir, $nr, $peer, 1); last if !$next; } $sum += $len; } return ($sum, $base_nr); } sub _mark_path_forward { my ($basedir, $pos, $peer) = @_; my @list = ($pos); while (@list) { my %next_list; foreach $pos (@list) { my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos); my $cand = sprintf("$basedir/version-%09d-$peer", $nr + 1); my $vers = get_link($cand, 2); next unless defined($vers) && $vers ne ""; $vers =~ s/^.*://; my ($cp, $cnr, $cfrom, $clen) = _parse_pos($basedir, $vers); if (int($cnr) == int($nr) && $cfrom eq $from && $clen == $len) { $next_list{$cand} = 1; } } @list = keys(%next_list); } } sub _mark_path_transitive { _mark_path_forward(@_); _mark_path_backward(@_); } sub log_purge_res { my ($cmd, $res) = @_; lwarn "DANGEROUS OPERATION: $cmd --force on resource '$res'\n" if $force; %visited_pos = (); my %logs; my %start_logs; my $basedir = "$mars/resource-$res"; my $max_retry = 3; retry: my $start_count = 0; my %situation = (); foreach my $data (lamport_glob("$basedir/{data,replay}-*")) { $data =~ m:/(data|replay)-(.+):; my $peer = $2; my $replay = "$basedir/replay-$peer"; my $target = get_link($replay, 1); next unless $target; lprint "found replay link '$replay' -> '$target'\n"; # only the logfile number is relevant, not the positions $target =~ s/,.*//; $situation{$replay} = $target; $start_logs{$target}++; $start_count++; _mark_path_transitive($basedir, $target, $peer); } if (!$start_count) { lprint "Resource '$res' contains no valid information - there is nothing to purge\n"; return; } my %to_delete = (); foreach my $file (lamport_glob("$basedir/version-*")) { $file =~ m:/(version-([0-9]+)-([^,]+)): or ldie "bad path '$file'\n"; my $cand = $1; my $nr = $2; my $from = $3; lprint "checking '$cand'\n"; my $vers = get_link($file, 1); $vers =~ m/(log-[0-9]+-[^,:]+)/; my $log = $1; lprint " corresponding logfile is '$log'\n"; if (_is_visited($nr, $from)) { lprint " ok '$cand'\n"; $logs{$log}++; next; } if (!$force && $from ne $host) { lprint " skipping foreign object '$cand'\n"; $logs{$log}++; next; } lwarn "deleting foreign object from peer '$from' because you said --force\n" if $from ne $host; $to_delete{$file}++; } foreach my $file (lamport_glob("$basedir/log-*")) { $file =~ m:/(log-[0-9]+-(.*)): or ldie "bad path '$file'\n"; my $log = $1; my $from = $2; lprint "checking '$log'\n"; if ($logs{$log}) { lprint " ok '$log'\n"; $logs{$log} = -1; next; } if ($start_logs{$log}) { lprint " ok start '$log'\n"; $logs{$log} = -1; next; } if (!$force && is_link_recent($log)) { lprint " skipping recent object '$log'\n"; next; } if (!$force && $from ne $host) { lprint " skipping foreign object '$log'\n"; next; } lwarn "deleting foreign object from peer '$from' because you said --force\n" if $from ne $host; $to_delete{$file}++; } # check for any races in the initial situation my $nr_races = 0; foreach my $replay (sort keys(%situation)) { my $old_situation = $situation{$replay}; my $target = get_link($replay, 1); # only the logfile number is relevant, not the positions $target =~ s/,.*//; if (!$target || $target ne $old_situation) { lwarn "Race on '$replay' -> '$target' instead of '$old_situation'\n"; $nr_races++; } } if ($nr_races) { if ($max_retry-- > 0) { lwarn "Restarting due to $nr_races races\n"; sleep(1); goto retry; } ldie "Detected $nr_races, the situation is not stable\n"; } # POINT OF NO RETURN foreach my $file (sort keys(%to_delete)) { _create_delete($file); } my $count = 0; foreach my $log (sort alphanum_cmp keys(%logs)) { my $nr = $logs{$log}; next if $nr < 0 || any_exists("$basedir/$log"); lprint_stderr "info: logfile '$log' is referenced ($nr), but not present.\n"; $count++; } if ($count) { lprint_stderr " Unreferenced logfiles are not necessarily bad.\n"; lprint_stderr " They can regularly appear after 'leave-resource',\n"; lprint_stderr " or 'invalidate', or after emergency mode,\n"; lprint_stderr " or after similar operations.\n"; } finish_links(); } sub err_purge_res { my ($cmd, $res) = @_; foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}actual-$host/msg-err-*")) { lprint " deleting '$leftlink'\n" if $verbose; _create_delete($leftlink); } finish_links(); } sub try_to_avoid_splitbrain { my ($cmd, $res, $old_primary) = @_; my $old_timeout = $timeout; $old_primary = "" if $old_primary eq "(none)"; if (!detect_splitbrain($res, 0)) { lwarn "ATTENTION: you are starting a non-forced primary switchover in a split brain situation.\n"; lwarn "ATTENTION: that's no good idea.\n"; lwarn "ATTENTION: I will continue to do what you want.\n"; lwarn "ATTENTION: But you are responsible for the consequences.\n"; return 0; } # now try to prevent producing a _new_ split brain situation.... my @host_list = lamport_glob("$mars/resource-$res/replay-*"); return 0 if scalar(@host_list) < 2; my ($min, $max) = get_minmax_versions($res); my $vers_glob = sprintf("$mars/resource-$res/version-%09d-*", $max); my $ok = 1; my $replay_err_path = "$mars/resource-$res/actual-$host/msg-err-replay-stop"; my $replay_err = get_link($replay_err_path, 1); if ($replay_err && $replay_err ne "OK") { my @stat = lstat($replay_err_path); if ($stat[9] + $window >= mars_time()) { my $msg = _get_text($replay_err); lwarn "cannot guarantee split brain avoidance: $msg\n"; return 0; } } my $emergency_path = "$mars/resource-$res/actual-$host/has-emergency"; my $emergency = get_link($emergency_path, 1); if ($emergency) { ldie "emergency mode $emergency has been entered locally: handover is not possible. Either free some space in $mars/, or use --force to use a potentially outdated version.\n"; } my $primary = _get_designated_primary($res, -1); if ($primary eq "(none)") { # try to determine the old primary when unique my $glob_logs = sprintf("$mars/resource-$res/log-%09d-*", $max); my @candidates = lamport_glob($glob_logs); if (scalar(@candidates) == 1) { my $log_path = pop @candidates; if ($log_path =~ m:/log-[0-9]+-(.+)$:) { $primary = $1; lprint "Using last primary '$primary' as a substitute.\n"; } } } if ($primary && $primary eq $host) { lprint "Switching back to last primary.\n"; return 0; } # if the old primary is known, we can ignore all other / unrelated hosts if ($primary && $primary ne $host && $primary ne "(none)") { my $path_p = "$mars/resource-$res/replay-$primary"; my $path_h = "$mars/resource-$res/replay-$host"; my $replay_p = get_link($path_p); my $replay_h = get_link($path_h); my $p_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $primary); my $h_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $host); my $p_vers = get_link($p_path, 1); my $h_vers = get_link($h_path, 1); if (!$p_vers || !$h_vers || $p_vers ne $h_vers) { # Safeguard against artificial or missing version links # (e.g. after crashes etc): # when replay links are equal, we are safe. my $stamp_p = get_link_stamp($path_p); my $stamp_h = get_link_stamp($path_h); my $p_stamp = get_link_stamp($p_path); my $stable_stamp = mars_time() - $window / 4; if ($replay_p eq $replay_h && $p_vers && $p_stamp < $stable_stamp && $stamp_p < $stable_stamp && $stamp_h < $stable_stamp) { # self-healing the versionlink lwarn "Correcting the versionlink '$h_path' from '$h_vers' to '$p_vers'\n"; set_link($p_vers, $h_path); finish_links(); } else { $ok = 0; } } elsif (!$replay_p || !$replay_h || $replay_p ne $replay_h) { $ok = 0; } } else { # old primary is unknown: we have no chance, other than comparing _all_ versions. my @versions = lamport_glob($vers_glob); my $first = get_link(shift @versions); while (@versions) { my $next = get_link(shift @versions); if ($next ne $first) { $ok = 0; } } } return 0 if $ok; lprint "Trying to avoid split brain for $timeout s: logfile update not yet completed.\n"; my $tpl = get_macro("replinfo"); my $new_situation = eval_macro($cmd, $res, $tpl, @_); print $new_situation; # condition not met return 1; } sub get_size { my $arg = shift; my $orig_arg = $arg; # Do what I mean: when given a device, take its actual size. if (-b $arg) { my $mangled = $arg; $mangled =~ s:/dev/::; $mangled =~ s:/:!:g; my $path = "/sys/block/$mangled/size"; $arg = `cat $path` * 512 if -r $path; } if ($arg !~ m/^([0-9]+(?:\.[0-9]*)?)([kmgtp]?)$/i) { ldie "Size argument '$arg' must be a number, optionally followed by a suffix [kKmMgGtTpP]. Lowercase = multiples of 1000, Uppercase = multiples of 1024.\n"; } my $mod = $2 || ""; $arg = $1; $_ = $mod; SWITCH: { /^$/ and last SWITCH; /^k$/ and $arg *= 1000, last SWITCH; /^m$/ and $arg *= 1000 * 1000, last SWITCH; /^g$/ and $arg *= 1000 * 1000 * 1000, last SWITCH; /^t$/ and $arg *= 1000 * 1000 * 1000 * 1000, last SWITCH; /^p$/ and $arg *= 1000 * 1000 * 1000 * 1000 * 1000, last SWITCH; /^K$/ and $arg *= 1024, last SWITCH; /^M$/ and $arg *= 1024 * 1024, last SWITCH; /^G$/ and $arg *= 1024 * 1024 * 1024, last SWITCH; /^T$/ and $arg *= 1024 * 1024 * 1024 * 1024, last SWITCH; /^P$/ and $arg *= 1024 * 1024 * 1024 * 1024 * 1024, last SWITCH; ldie "bad unit suffix '$mod'"; } ldie "size argument '$orig_arg' evaluating to '$arg' is not a multiple of 4K = 4096\n" if ($arg % 4096) != 0; return $arg; } # DEPRECATED # # TST NOTE: avoid calling this function. As such it is conceptually wrong, # because during split-brain situations, there exists a _set_ of non-unique # primaries. I want to remove this function, but I currently can't because # other internal software at 1&1 is depending on it. # # Get actual primary node from links below actual-*/ subdirs # sub _get_actual_primary { my ($res) = @_; # TST: Presence of local device takes precedence over anything else. # This tries to workaround the most important special case of # split-brain situations, but cannot fix the problem exhaustively. llog "DEPRECATED: you are trying to uniquely identify an actual primary hostname (as seen from host $host resource $res), but this is conceptually wrong because in split-brain situations there may exist multiple ones. Use view-is-primary instead. That would be safe.\n"; return $host if device_exists($res); # The following old code is CONCEPTUALLY WRONG for split-brain situations (see NOTE above) my @primary_links = lamport_glob("$mars/resource-$res/actual-*/is-primary"); my $primary; foreach my $link (@primary_links) { if (my $val = get_link($link)) { $primary = ($link =~ qr%.*actual-([^/]+)/is-primary%)[0]; last; # Note: if there are more than one 'is-primary' links (an insane state anyway), # the first 'is-primary' link is selected. Other links are ignored. } } return $primary; } my %old_primary; sub _get_designated_primary { my ($res, $unchecked) = @_; my $fallback_to_old = 0; if (defined($unchecked) && $unchecked == -1) { $fallback_to_old = 1; $unchecked = 1; } my $val = get_link("$mars/resource-$res/primary", $unchecked); if (!defined($val) || !$val || $val eq "(none)") { if ($fallback_to_old) { $val = $old_primary{$res} if defined($old_primary{$res}); if (!defined($val) || !$val || $val eq "(none)") { my $last_lnk = "$mars/resource-$res/userspace/last-primary"; $val = get_link($last_lnk, 2); } } } else { $old_primary{$res} = $val; } return $val; } sub is_actual_primary { my ($cmd, $res, $peer) = @_; $peer = $host unless (defined($peer) && $peer); my $is_primary = get_link("$mars/resource-$res/actual-$peer/is-primary"); # notice: device presence _must not_ be used anymore. return $is_primary; } sub __conv_tv { my ($tv_sec, $tv_nsec) = @_; if (defined($tv_nsec)) { $tv_nsec = ".$tv_nsec"; } else { $tv_nsec = ""; } my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(int($tv_sec)); return "$tv_sec$tv_nsec" unless defined($sec); return sprintf("%04d-%02d-%02d %02d:%02d:%02d%s", $year+1900, $mon + 1, $mday, $hour, $min, $sec, $tv_nsec); } sub __conv_errno { my ($txt, $error) = @_; return "$txt$error" if !defined($error) || ($error <= 0); $! = $error; my $res = "${txt}[$!]"; $! = 0; return $res; } sub _replace_timestamps { my ($txt, $omit_nsec) = @_; if (defined($omit_nsec) && $omit_nsec) { $txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1):ge; } else { $txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1,$2):ge; } $txt =~ s:((error|status)\s*=?\s*-)([0-9]+):__conv_errno($1,$3):ge; return $txt; } sub _get_text { my ($glob, $regex, $do_print, $get_count) = @_; my $text = ""; my $count = 0; foreach my $path (lamport_glob($glob)) { open(IN, "<", $path) or next; while (my $line = ) { # use regex e.g. for fetching only errors and warnings if (!$regex || $line =~ $regex) { $line = _replace_timestamps($line); $count++; if ($do_print) { print $line; llog($line); } else { $text .= $line; } } } close(IN); } return $count if defined($get_count) && $get_count; return $text; } my %global_error_texts; sub get_error_text { my ($cmd, $res) = @_; my $path = "$mars/resource-$res/logstatus-$host.status"; if (defined($global_error_texts{$path})) { return $global_error_texts{$path}; } my $text = _get_text($path, "m/^(err|warn)/i", 0); $global_error_texts{$path} = $text; return $text; } ################################################################## # helpers sub _trigger { my $code = shift || 1; if ($dry_run) { lprint "DRY_RUN: would execute trigger '$code'\n"; return; } lprint "Trigger '$code'\n" if $verbose; system("(echo -n $code > /proc/sys/mars/trigger) >/dev/null 2>&1"); } # Please do not misuse this. # Use it only _exceptionally_ for creation of new memberships & co. # Always prefer the PULL PRINCIPLE where possible. sub _push_link { my ($peer, $src, $dst) = @_; if (!$peer || !$src || !$dst) { lwarn "Bad push_link args '@_'\n"; return; } if ($peer eq $real_host) { set_link($src, $dst); return; } my $code = "p $peer $src $dst"; lprint "Pushing link '$dst' -> '$src' to peer '$peer'\n" if $verbose; _trigger($code); } # DTO do not abuse sub _push_link_foreign { my ($peer, $peer_ip, $src, $dst) = @_; if (!$peer || !$src || !$dst) { lwarn "Bad push_link args '@_'\n"; return; } if ($peer eq $real_host) { set_link($src, $dst); return; } $peer_ip = _get_ip($peer) unless $peer_ip; my $code = "P $peer $peer_ip $src $dst"; lprint "Pushing link '$dst' -> '$src' to peer '$peer' '$peer_ip'\n" if $verbose; _trigger($code); } sub _push_check { my ($peer, $peer_ip, $path) = @_; return if $peer eq $real_host; if (!$peer || !$path) { lwarn "Bad push_check args '@_'\n"; return; } $peer_ip = _get_ip($peer) unless $peer_ip; my $code = "c $peer $peer_ip $path"; lprint "Pushing check '$path' to peer '$peer' ip='$peer_ip'\n" if $verbose; _trigger($code); } sub _push_fetch { my ($peer, $peer_ip, $dir_path) = @_; if (!$peer || !$dir_path) { lwarn "Bad push_fetch args '@_'\n"; return; } $peer_ip = _get_ip($peer) unless $peer_ip; my $code = "f $peer $peer_ip $dir_path"; lprint "Trigger fetch '$dir_path' from peer '$peer' ip='$peer_ip'\n" if $verbose; lprint "$code\n" if $verbose > 1; _trigger($code); } sub _switch { my ($cmd, $res, $path, $on) = @_; my $src = $on ? "1" : "0"; $path = correct_path($path); my $old = get_link($path, 1); if ($old && $old eq $src) { lprint "${cmd} on resource $res is already activated\n" if $cmd; return; } set_link($src, $path); lprint "successfully started ${cmd} on resource $res\n" if $cmd; } sub _writable { my ($path, $on) = @_; my $oldmode = (lstat $path)[2] & 0700; my $newmode = $on ? $oldmode | 0200 : $oldmode & ~0200; lprint "chmod '$path' $oldmode $newmode"; chmod($newmode, $path) == 1 or ldie "cannot chmod '$path'\n"; } sub _get_ip { my ($peer) = @_; $peer = $host unless $peer; check_id($peer); # Dynamic programming: this also applies to --ip-$peer=$peer_ip return $known_ips{$peer} if $known_ips{$peer}; # Normally, everything should be in /mars/ips/ip-* my $ip_path = "$mars/ips/ip-$peer"; if (my $from_link = get_link($ip_path, 2)) { lprint_stderr "Using IP '$from_link' from '$ip_path'\n" if $verbose; $known_ips{$peer} = $from_link; return $from_link; } # Try any probe data my $probe_path = "$mars/probe-$real_host/$mars/ips/ip-$peer"; if (my $probe_link = get_link($probe_path, 2)) { lprint_stderr "Using PROBE IP '$probe_link' from '$probe_path'\n" if $verbose; $known_ips{$peer} = $probe_link; return $probe_link; } # Try the backups in reverse order my $backup_glob = "$mars/backups-*/ips-backup/ip-$peer"; foreach my $backup (sort reverse_cmp lamport_glob($backup_glob)) { my $check = get_link($backup, 2); if ($check) { lprint_stderr "Using BACKUP IP '$check' from '$backup'\n" if $verbose; $known_ips{$peer} = $check; return $check; } } # Try /usr/bin/getent first my $answer = `/usr/bin/getent hosts '$peer'`; chomp $answer; my $type = "GETENT"; if (!$answer || $answer !~ m/([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/) { # Try /usr/bin/host $answer = `/usr/bin/host -t A '$peer'`; chomp $answer; my $type = "DNS"; } if ($answer && $answer =~ m/([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/) { my $addr = $1; lprint_stderr "$type query for '$peer' found IPv4 address '$addr'\n"; $known_ips{$peer} = $addr; return $addr; } ldie "Cannot determine foreign IP for peer '$peer'\n" if $peer ne $real_host; chomp (my @info = `/sbin/ip addr show`); my $interface = ""; foreach my $line (@info) { $interface = $1 if $line =~ m#^[0-9]+:\s([a-zA-Z_0-9]+):#; next if $interface eq "lo"; if ($line =~ m#\sinet\s(\d+\.\d+\.\d+\.\d+)#) { my $from_if = $1; lprint_stderr "Using IP '$from_if' from interface '$interface'\n"; $known_ips{$peer} = $from_if; return $from_if; } } ldie "Cannot determine my own IP address ($real_host)\n"; } sub _fake_versionlink { my ($cmd, $res, $nr_links, $log_nr, $primary) = @_; my $basedir = "$mars/resource-$res"; my $make_count = 0; for (my $rounds = $nr_links; $rounds > 0 && $log_nr > 0; $rounds--) { my $new_version = sprintf("$basedir/version-%09d-$host", $log_nr); my $pri_version = sprintf("$basedir/version-%09d-$primary", $log_nr); if ($primary eq $host) { ldie "Cannot fake my own version link '$new_version'\n"; } my $retry = 3; for (;;) { my $pri_link = get_link($pri_version, 1); if ($pri_link) { lprint "Creating new versionlink '$new_version' -> '$pri_link'\n"; set_link($pri_link, $new_version); $make_count++; last; } lwarn "Primary versionlink '$pri_version' does not exist\n"; return 0 if $retry-- < 0; # During races with log-rotate, the appearace may be delayed wait_cluster($cmd, $res, $primary, 0); } $log_nr--; } lwarn "Cannot create faked versionlink\n" if !$make_count; return $make_count; } sub _set_replaylink { my ($cmd, $res, $log_nr, $primary, $msg) = @_; $msg = " -- THIS IS EXTREMELY RISKY -- any inconsistencies are on your own!" unless defined($msg); ldie "no designated primary defined\n" unless ($primary && $primary ne "(none)"); my $basedir = "$mars/resource-$res"; my $rep_path = "$basedir/replay-$host"; my $rep_val = sprintf("log-%09d-$primary,0,0", $log_nr); lprint "creating new replaylink '$rep_path' -> '$rep_val'\n"; set_link($rep_val, $rep_path); _push_link($primary, $rep_val, $rep_path) if $primary ne $host; my $ok = _fake_versionlink($cmd, $res, 2, $log_nr, $primary); set_link("$log_nr$msg", "$basedir/skip-check-$host"); return $ok; } ################################################################## # lowlevel tools sub lowlevel_ls_host_ips { my ($cmd) = @_; for my $path (lamport_glob("$mars/ips/ip-*")) { $path =~ m:/ip-(.*):; my $peer = $1; if ($peer =~ $match_reserved_id) { lwarn "IMPORTANT: for whatever reason, you have configured a syntactically invalid peer name '$peer' in /mars/ips/ .\n"; lwarn "IMPORTANT: 'none' or '(none)' is a reserved name with a special meaning.\n"; lwarn "IMPORTANT: please fix this by hand. I cannot know how this has happened, and the reasons.\n"; lwarn "IMPORTANT: please search mars-user-manual.pdf for cluster-wide deletions, and consult \"marsadm --help\" for commands like \"marsadm delete-file\" and sibligs.\n"; lwarn "IMPORTANT: I am now continuing at YOUR risk, but expect some SERIOUS PROBLEMS.\n"; } my $ip = get_link($path, 1); lprint "$peer $ip\n"; } } sub lowlevel_set_host_ip { my ($cmd, $peer, $ip) = @_; check_id($peer, 0, 1); if (!$ip) { $ip = _get_ip($peer); } my $path = "$mars/ips/ip-$peer"; my $old = get_link($path, 2) || ""; lprint "Set host '$peer' IP from '$old' to '$ip'\n"; set_link($ip, $path); foreach my $peer2 (get_total_peers()) { next if $peer2 eq $peer; _push_link($peer2, $ip, $path); } } sub lowlevel_delete_host { my ($cmd, $peer) = @_; check_id($peer, 1); my $path = "$mars/ips/ip-$peer"; my $old = get_link($path); lprint "Removing host '$peer' old IP '$old'\n"; if (!$compat_deletions) { foreach my $peer2 (get_total_peers()) { next if $peer2 eq $peer; _push_link($peer2, ".deleted", $path); } } _create_delete($path); } ################################################################## # commands sub ignore_cmd { my ($cmd, $res) = @_; lprint "ignoring command '$cmd' on resource '$res'\n"; } sub senseless_cmd { my ($cmd, $res) = @_; lprint "command '$cmd' makes no sense with MARS (ignoring on resource '$res')\n"; } sub forbidden_cmd { my ($cmd, $res) = @_; ldie "command '$cmd' on resource '$res' cannot be used with MARS (migth affect too many hosts, lead to undesired consequences)\n"; } sub nyi_cmd { my ($cmd, $res) = @_; ldie "command '$cmd' on resource '$res' is not yet implemented\n"; } sub is_module_loaded { return -d "/proc/sys/mars"; } sub set_connect_pref_list { my ($cmd, $res, $list) = @_; lwarn "This command '$cmd' is deprecated. Please try to avoid it.\n"; check_res_member($cmd, $res); my $dst = "$mars/resource-$res/connect-$host"; if ($cmd =~ m/^get-/) { my $value = get_link($dst); lprint "$value\n"; return; } if (!defined($list) || $list eq "") { lprint_stderr "info: as requested, I am deleting '$dst'\n"; _create_delete($dst); return 0; } check_id_list($list, 1, 1); set_link($list, $dst); } sub set_defaults { my ($cmd, $res, $value) = @_; my $key = $cmd; $key =~ s/set-(global-)?//; my $path = "$mars/defaults/$key"; if ($res) { check_res_member($cmd, $res) if $res; $path = "$mars/resource-$res/defaults/$key"; } if ($value) { my $flags = txt2featuresflags($cmd, $value); lprint "Set '$path' to '$flags'\n"; set_link($flags, $path); } else { lprint "Delete '$path'\n"; _create_delete($path); } } sub emergency_limit_res { my ($cmd, $res, $value) = @_; my $dst = "$mars/resource-$res/todo-$host/emergency-limit"; if ($cmd =~ m/^get-/) { my $value = get_link($dst); lprint "$value\n"; return; } ldie "percent argument '$value' isn't numeric\n" unless $value =~ m/^[0-9]+$/; ldie "percent argument '$value' isn't between 0 and 100%\n" unless ($value >= 0 && $value <= 100); set_link($value, $dst); } sub set_link_cmd { my $cmd = shift; for (;;) { my $src = shift || last; if ($cmd =~ m/^get-/) { my $value = get_link($src); lprint "$value\n"; next; } my $dst = shift || ldie "you did not supply a symlink destination for source '$src'\n"; ldie "symlink target '$dst' is not an absolute path\n" unless $dst =~ m:^/:; check_userspace($dst); my $dir = `dirname "$dst"` or ldie "path '$dst' has no dirname\n"; chomp $dir; ldie "directory '$dir' does not exist\n" unless -d $dir; set_link($src, $dst); } } sub set_sync_limit_value { my ($cmd, $value) = @_; if ($cmd !~ /-global-/) { lwarn "The old syntax '$cmd' is deprecated - please use the new syntax {get,set}-global-*\n"; } my $todo_dir = "$mars/defaults-$host"; ldie "directory '$todo_dir' does not exist\n" unless -d $todo_dir; my $dst = "$todo_dir/sync-limit"; if ($cmd =~ m/^get-/) { my $value = get_link($dst); lprint "$value\n"; return; } set_link($value, $dst); } sub create_uuid { my ($cmd) = @_; my $old_uuid = get_link("$mars/uuid", 2); ldie "Cluster was already created with uuid='$old_uuid'. " . "For safety reasons, no override is possible at marsadm level.\n" if $old_uuid; my $uuid = `echo -n \$(hostname) \$(date)`; set_link($uuid, "$mars/uuid"); finish_links(); # opportunity for errors => don't continue lprint "New cluster UUID is '$uuid'\n"; } sub _create_dirs { my ($cmd) = @_; system("mkdir $mars/ips") unless -d "$mars/ips"; system("mkdir $mars/userspace") unless -d "$mars/userspace"; system("mkdir $mars/defaults") unless -d "$mars/defaults"; system("mkdir $mars/defaults-$host") unless -d "$mars/defaults-$host"; system("mkdir $mars/todo-global") unless -d "$mars/todo-global"; mkdir("$mars/actual-$host") unless -d "$mars/actual-$host"; set_link($marsadm_version, "$mars/actual-$host/marsadm-version"); } sub _create_cluster { my ($cmd) = @_; ldie "The $mars directory does not exist.\n" unless -d $mars; my $ip = _get_ip($host); _create_dirs($cmd); create_uuid(@_) if $cmd eq "create-cluster"; set_link("0", "$mars/defaults-$host/sync-limit"); set_link("(none)", "$mars/defaults-$host/sync-pref-list"); set_link($ip, "$mars/ips/ip-$host"); set_link("1", "$mars/todo-global/deleted-$host"); } sub create_cluster { my ($cmd, $peer) = @_; ldie "cluster is already created\n" if !$force && -d "$mars/ips"; ldie "mars module is loaded, please unload first\n" if is_module_loaded(); _create_cluster(@_); } sub join_cluster { my ($cmd, $peer, $peer_ip) = @_; ldie "Cannot join myself (peer='$peer', host='$host')\n" if $peer eq $host; ldie "Directory $mars is missing\n" unless -d $mars; if (lamport_glob("$mars/resource-*")) { lwarn "DANGER: some resources already exist!\n"; ldie "DANGER: If you are sure that no resource clash is possible, re-invoke $cmd with '--force' option\n" unless $force; } _create_dirs($cmd); $peer_ip = _get_ip($peer) unless $peer_ip; # try new join method if (is_module_loaded()) { my $ip = _get_ip($host); lprint "MARS kernel module is loaded, trying the new $cmd method.\n"; my $old_uuid = get_link("$mars/uuid", 2); if (!$old_uuid || $old_uuid eq "(any)") { mkdir("$mars/ips") unless -d "$mars/ips"; mkdir("$mars/todo-global") unless -d "$mars/todo-global"; mkdir("$mars/defaults-$host") unless -d "$mars/defaults-$host"; mkdir("$mars/actual-$host") unless -d "$mars/actual-$host"; set_link("(any)", "$mars/uuid") unless $old_uuid; set_link($ip, "$mars/ips/ip-$host"); my $peer_known = link_exists("$mars/ips/ip-$peer"); if (!$peer_known) { set_link($peer_ip, "$mars/ips/ip-$peer"); } # make PRELIMINARY links, timestamp is 1 second after the Big Bang (1970) finish_links(1); _trigger(); lprint "Update local '$real_host' information\n"; update_cluster($cmd, "all", $real_host); if ($peer_known) { lprint "Update peer '$peer' information\n"; update_cluster($cmd, "all", $peer); } # write again, this time with current mars_time() set_link($ip, "$mars/ips/ip-$host"); set_link("0", "$mars/defaults-$host/sync-limit"); set_link("(none)", "$mars/defaults-$host/sync-pref-list"); my $deleted_path = "$mars/todo-global/deleted-$host"; set_link("1", $deleted_path) unless link_exists($deleted_path); finish_links(); _push_link($peer, $ip, "$mars/ips/ip-$host"); lprint "Checking uuid\n"; my $round = 0; my $new_uuid = get_link("$mars/uuid", 1); while ($round++ < 5) { _trigger(3); sleep(3); lprint "... update from $peer round $round\n"; _reset_resources(); update_cluster($cmd, "all", $peer); $new_uuid = get_link("$mars/uuid"); last if ($new_uuid && $new_uuid ne "(any)"); _push_link($peer, $ip, "$mars/ips/ip-$host"); } if ($new_uuid && $new_uuid ne "(any)") { lprint "Successfully joined cluster, uuid='$new_uuid'\n"; return; } else { lwarn "Unable to determine the remote cluster uuid.\n"; } } else { lwarn "Cluster uuid '$old_uuid' already present, cannot use the new $cmd method\n"; } } lprint "Falling back to the old ssh/rsync based $cmd method (peer='$peer' peer_ip='$peer_ip')\n"; ldie "OLD method: MARS module is loaded, please unload first before using ssh\n" if is_module_loaded(); my $ip = _get_ip($host); rsync_cmd($peer, "--max-size=1 --update $peer:$mars/ $mars/"); # check uniqness of IPs my %ips = (); foreach my $other_ip_path (lamport_glob("$mars/ips/*")) { my $other_ip = get_link($other_ip_path, 1); my $other_host = $other_ip_path; $other_host =~ s:.*/ip-::; lwarn "Host '$other_host' IP '$other_ip' is AMBIGUOUS\n" if defined($ips{$other_host}); $ips{$other_host} = $other_ip; lwarn "New IP '$ip' already exists for host '$host' at '$other_ip_path'\n" if ($ip eq $other_ip && $other_host eq $host); } _create_cluster(@_); finish_links(); rsync_cmd($peer, "--update $mars/ips/ $peer:$mars/ips/"); } sub _get_probe { my ($probe_dir, $peer, $peer_ip) = @_; $peer_ip = _get_ip($peer) unless $peer_ip; mkdir($probe_dir); my $new_mars = "$probe_dir/$mars"; _push_fetch($peer, $peer_ip, $probe_dir); return $new_mars; } # allow accumulation data from multiple peers sub _wait_probe { my ($probe_dir, $peer, $peer_ip) = @_; my $got_path = "$probe_dir/got-$peer"; lprint "Wait for probe data from '$peer' in '$got_path'\n"; while (!link_exists($got_path)) { lprint "Wait for '$got_path' to appear\n"; sleep_timeout(5); last if link_exists($got_path); _push_fetch($peer, $peer_ip, $probe_dir); } lprint "Got probe data from '$peer' in '$got_path'\n"; } sub merge_cluster { my ($cmd, $peer, $peer_ip) = @_; my $uuid = readlink("$mars/uuid"); my @resources = lamport_glob("$mars/resource-*"); my @ip_links = lamport_glob("$mars/ips/*"); if ($cmd =~ m/-list/) { print "UUID: $uuid\n"; print "IPs:\n"; foreach my $ip (@ip_links) { print "$ip\n"; } print "RESOURCEs:\n"; foreach my $i (@resources) { print "$i\n"; } return; } ldie "No peer argument given" unless $peer; ldie "Cannot merge myself (peer='$peer', host='$host')\n" if $peer eq $host; ldie "Directory $mars is missing\n" unless -d $mars; ldie "A cluster UUD '$mars/uuid' does not exist. Please use 'join-cluster instead.\n" unless link_exists("$mars/uuid"); # start_deprecated_CODE if (!ssh_cmd($peer, $ssh_probe, 1)) { lprint "Falling back to OLD ssh-based $cmd\n"; merge_cluster_old(@_); return; } lprint "Using NEW $cmd without ssh\n"; # end_deprecated_code if ($peer_ip) { lprint "Given on command line: peer '$peer' has IP '$peer_ip'\n"; } else { $peer_ip = _get_ip($peer); ldie "Cannot determine IP for peer '$peer'. Please give the IP on the command lime.\n" unless $peer_ip; lprint "Determined: peer '$peer' appears to have IP '$peer_ip'\n"; } my $peer_ip_path = "$mars/ips/ip-$peer"; my $probe_ip = get_link($peer_ip_path, 1); unless ($probe_ip) { finish_links(); lprint "Need to set preliminary peer IP\n"; set_link($peer_ip, $peer_ip_path); # Make PRELIMINARY links, timestamp is 1 second after the Big Bang (1970) finish_links(1); _trigger(3); } my $probe_dir = "$mars/probe-$real_host"; system("rm -rf $probe_dir"); my $new_mars = _get_probe($probe_dir, $peer, $peer_ip); _wait_probe($probe_dir, $peer, $peer_ip); my %ips; my %old_peers; my %merge_peers; foreach my $old_peer (get_total_peers()) { $ips{$old_peer} = _get_ip($old_peer); $old_peers{$old_peer} = 1; $merge_peers{$old_peer} = 1; lprint " old peer '$old_peer' ip='$ips{$old_peer}'\n"; } foreach my $ip_path (lamport_glob("$new_mars/ips/ip-*")) { my $ip = get_link($ip_path, 1); next unless $ip; my $new_peer = $ip_path; $new_peer =~ s:^.+/ip-::; next if $merge_peers{$new_peer}; $ips{$new_peer} = _get_ip($new_peer); $merge_peers{$new_peer} = 1; lprint " new peer '$new_peer' ip='$ips{$new_peer}'\n"; } my %new_res; foreach my $ip (lamport_glob("$new_mars/resource-*")) { my $new_res = $ip; $new_res =~ s:^.+/resource-::; $new_res{$new_res} = 1; } my $new_uuid = readlink("$new_mars/uuid"); ldie "Other cluster peer '$peer' has no UUID\n" unless $new_uuid; my %old_res; foreach my $str (get_total_resources()) { $old_res{$str} = 1; } if ($new_uuid eq $uuid) { lprint "Other cluster peer '$peer' has the same UUID.\n"; lprint "No resource name checking necessary.\n"; lprint "Operation '$cmd' will work logically idempotent.\n"; } else { # Check that both sets of resources are logically disjoint lprint "Other cluster peer '$peer' has a different UUID, checking for resource name conflicts.\n"; my %inter_res = key_intersect(\%old_res, \%new_res); my $conflicts = 0; foreach my $res (keys(%inter_res)) { lwarn "common resource '$res'\n"; $conflicts++; } ldie "cannot merge cluster '$peer' due to $conflicts conflicts\n" if $conflicts; } lprint "MERGE '$peer'\n"; foreach my $new_peer1 (keys(%merge_peers)) { lprint "fetch '$new_peer1'\n"; _push_fetch($new_peer1, $ips{$new_peer1}, "/"); foreach my $new_peer2 (keys(%merge_peers)) { lprint " push '$new_peer2'\n"; my $ip_path = "$mars/ips/ip-$new_peer2"; _push_link_foreign($new_peer1, $ips{$new_peer1}, $ips{$new_peer2}, $ip_path); } } system("rm -rf $probe_dir"); } # deprecated, to disappear sub merge_cluster_old { my ($cmd, $peer) = @_; ldie "ssh is disabled\n" unless $ssh_port; my $uuid = readlink("$mars/uuid"); my @resources = lamport_glob("$mars/resource-*"); my @ip_links = lamport_glob("$mars/ips/*"); # check connections my $check_cmd = "uname -a"; system("$check_cmd") == 0 or ldie "oops, 'uname is not installed'\n"; system("rsync --help > /dev/null") == 0 or ldie "Command 'rsync' is not installed\n"; ssh_cmd($peer, $ssh_probe); my @old_peers; foreach my $ip (@ip_links) { my $old_peer = $ip; $old_peer =~ s:^.+/ip-::; next if $old_peer eq $host; next if $old_peer eq $real_host; ssh_cmd($old_peer, $ssh_probe); push @old_peers, $old_peer; } # check whether merge-cluster is possible my %total_res; foreach my $res (@resources) { $total_res{$res}++; } my $ssh_cmd = make_ssh_cmd($peer) . " marsadm merge-cluster-list"; my $answer = `$ssh_cmd`; $answer =~ m/^UUID: (.*)$/m or ldie "cannot determine remote UUID from '$answer'\n"; my $other_uuid = $1; ldie "Other cluster peer '$peer' has no UUID\n" unless $other_uuid; if ($other_uuid eq $uuid) { lprint "Other cluster peer '$peer' has the same UUID.\n"; lprint "No resource name checking necessary.\n"; lprint "Operation '$cmd' will therfore work logically idempotent.\n"; } else { if (link_exists("$mars/tree-$peer")) { lwarn "A valid tree signature '$mars/tree-$peer' already exists, thus it appears to be already merged!\n"; ldie "Aborting for saftey. Override via --force only if you know what you are doing!\n" unless $force; } # Check that both sets of resources are disjoint lprint "Other cluster peer '$peer' has a different UUID, checking for resource name conflicts.\n"; my @other_resources; my @conflicts; my $copy = $answer; $copy =~ s/\A.*?RESOURCEs:\n//ms; while ($copy) { $copy =~ s/\A(.*)\n$//m; my $other_res = $1; last unless $other_res; push @other_resources, $other_res; if ($total_res{$other_res}++) { push @conflicts, $other_res; } } if (@conflicts) { lprint "CONFLICTS:\n"; foreach my $res (@conflicts) { lprint "\t$res\n"; } ldie "Cannot $cmd: some resource directories exist at both clusters with same name.\nThis cannot be overriden.\nPlease resolve the conflict by hand.\n"; } lprint "List of total resources:\n"; foreach my $res (keys(%total_res)) { lprint "\t$res\n"; } # INTERNAL, for debugging and error analysis: backup the old uuid symlink my $backup = "$backup_dir/uuid-backups"; system("mkdir -p $backup; cp -a $mars/uuid $backup/") unless link_exists("$backup/uuid"); } # Start the "hot phase" my $rsync_cmd = "--max-size=1"; rsync_cmd($peer, "$rsync_cmd $peer:$mars/uuid $mars/uuid"); foreach my $old_peer (@old_peers) { rsync_cmd($old_peer, "$rsync_cmd $mars/uuid $old_peer:$mars/uuid"); } $rsync_cmd .= " --update --ignore-existing"; rsync_cmd($peer, "$rsync_cmd $peer:$mars/ $mars/"); rsync_cmd($peer, "$rsync_cmd $mars/ $peer:$mars/"); } sub leave_cluster { my ($cmd) = @_; ldie "mars kernel module is not loaded. This is needed for communication with some other hosts!\n" if !is_module_loaded(); my $check = "/mars/resource-*/data-$host"; ldie "I am member of some resources\n" if lamport_glob($check) && !$force; wait_cluster($cmd) unless $force; foreach my $path (lamport_glob("$mars/actual-*/*-$host")) { _create_delete($path); } _create_delete("$mars/ips/ip-$host"); finish_links(); wait_cluster($cmd) unless $force; foreach my $path (lamport_glob("$mars/actual-*/*-$host")) { _create_delete($path); } _create_delete("$mars/todo-global/deleted-$host"); finish_links(); wait_cluster($cmd) unless $force; while (-f "$mars/ips/ip-$host" && $timeout_val >= 0) { sleep_timeout(3, 1); } system("rmmod mars") if $host eq $real_host; foreach my $path (lamport_glob("$mars/{,resource-*/}{todo,actual}-*/*-$host")) { unlink($path); } foreach my $path (lamport_glob("$mars/{,resource-*/}{todo,actual}-$host")) { rmdir($path); # at least try it } } sub create_res { my ($cmd, $res, $dev, $appear, $size_arg) = @_; my $create = ($cmd eq "create-resource"); ldie "undefined device or size argument\n" unless $dev; $appear = $res if !$appear; check_id($appear, 0, 1) if $appear; my $resdir = "$mars/resource-$res"; if ($create) { if (-d $resdir) { lwarn "resource directory '$res' already exists\n"; my @host_list = lamport_glob("$resdir/replay-*"); if (@host_list) { my $h_list = join(',', map({ $_ =~ s:.*/replay-::; $_; } (@host_list))); lwarn "DANGER: hosts '$h_list' are already member of resource '$res'.\n"; ldie "REFUSING to trash your resource!\n" unless $force; } } lprint "creating new resource '$res'\n"; } else { if (link_exists("$resdir/data-$host")) { lwarn "resource '$res' has been already joined -- this is dangerous!\n"; ldie "refusing dangerous operation\n" unless $force; } else { lprint "joining to existing resource '$res'\n"; } } my $size = 0; if (-b $dev) { ldie "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//; use Fcntl 'SEEK_END', 'O_RDONLY', 'O_RDWR', 'O_EXCL'; my $flags = O_RDWR | O_EXCL; if ($force) { $flags = O_RDONLY; } sysopen(TEST, $dev, $flags) or ldie "cannot open device '$dev' for exclusive rw access\n"; $size = sysseek(TEST, 0, SEEK_END); close(TEST); lprint "block device '$dev': determined size = $size bytes\n"; if ($size_arg) { my $new_size = get_size($size_arg); ldie "size argument '$size_arg' is smaller than device size '$size'\n" unless $new_size <= $size; lprint "reducing size from $size to $new_size\n"; $size = $new_size; } } elsif (-f $dev) { lprint "Using regular (sparse) file '$dev' as an image file\n"; $size = get_size($dev); if ($size > 0) { $dev = ""; } else { ldie "bad parameter '$dev': must be either a block device name, or size followed by k or m or g or t\n"; } } else { ldie "Device argument '$dev' does not exist or is no usable block device.\n" } ldie "implausible size $size" unless $size > 4096 * 16; # smaller floppies should not exist ;) # check for uniqeness of $appear if ($appear) { foreach my $old_dev (lamport_glob("$mars/resource-*/device-$host")) { $old_dev =~ m:/resource-([^/]+)/:; my $old_res = $1; next if (!$old_res || $old_res eq $res || $old_res eq "(none)"); my $old_name = get_link($old_dev); if ($old_name eq $appear) { if (link_exists("$mars/resource-$old_res/data-$host")) { ldie "Device name '/dev/mars/$old_name' already exists at resource '$old_res'\n"; } else { lwarn "Device name '/dev/mars/$old_name' already exists in another resource '$old_res'.\n"; lwarn "This does no harm, but may be confusing.\n"; lwarn "Please name your devices equal to the resource names by convention.\n"; } } } # warn if devices are named differently throughout the cluster foreach my $old_dev (lamport_glob("$resdir/device-*")) { my $old_name = get_link($old_dev); next unless $old_name; next if $old_name eq "(none)"; if ($old_name ne $appear) { $old_dev =~ m:/device-(.+)$:; my $old_host = $1; next unless $old_host; lwarn "your name '/dev/mars/$appear' differs from '/dev/mars/$old_name' on host '$old_host'.\n"; lwarn "this does no harm, but may be confusing.\n"; } } } err_purge_res($cmd, $res); if (!$create) { _activate_resource($cmd, $res, $host); } my $max_primary_retry = 5; primary_retry: my $primary; my $replay_nr = -1; my $use_rsync = 0; if ($create) { mkdir($resdir); ldie "could not create resource '$res'\n" unless -d $resdir; set_link($size, "$resdir/size"); } else { # join my $ip = _get_ip($host); # For safety, try to get the very newest infos. # Reason: newer kernel modules will fetch non-member resource infos less frequently. # Therefore we shift some responsibility for non-member -> member transitions to userspace. $primary = _get_designated_primary($res, 1); for (my $retry = 0; $retry < 3; $retry++) { last if (defined($primary) && $primary); lprint "Trying to get primary info\n"; my $condition = "return _get_designated_primary(\"$res\", 1);"; $primary = update_cluster($cmd, $res, "*", $condition); } my $max_retry = 5; for (;;) { $primary = _get_designated_primary($res, 1); last if $max_retry-- < 0; if ($primary && $primary ne $host && $primary ne "(none)") { # first check whether symlink information appears to be recent enough last if recent_cluster($cmd, $res, $primary); # for safety, fetch newest infos from last known primary if ($max_retry >= 3) { lprint "Trying update-cluster from '$primary'\n"; update_cluster($cmd, $res, $primary); next; } elsif ($max_retry >= 1) { lprint "Trying full update-cluster\n"; update_cluster($cmd, $res); next; } $use_rsync++; rsync_cmd($primary, "--max-size=1 --update $primary:$mars/resource-$res/ $mars/resource-$res/", $res, 1); next; } lprint "Using update-cluster for primary determination\n"; update_cluster($cmd, $res); next if $max_retry > 2; my @peers = map { m:^$mars/ips/ip-(.+):; $1 } lamport_glob("$mars/ips/ip-*"); if (!@peers || $max_retry < 0) { lwarn "cannot get a relevant primary\n"; last; } # We are desperate. Try to fetch the directory from anywhere. my $peer = shift @peers; next if $peer eq $host; next if $peer eq $real_host; $use_rsync++; rsync_cmd($peer, "--max-size=1 --update $peer:$mars/resource-$res $mars/", 1); } # retry loop ldie "resource '$res' does not exist\n" unless -d $resdir; ldie "resource '$res' has no designated primary\n" unless $primary && $primary ne "(none)"; finish_links(); wait_cluster($cmd, $res); my $res_size = get_link("$mars/resource-$res/size", 1); if ($size < $res_size) { lwarn "size of new device is only $size, but should be $res_size\n"; ldie "refusing to join due to bad size\n" unless $force; } elsif ($size > $res_size) { lprint "Your physical device has size $size, which is larger than the logical resource size $res_size.\n"; lprint "This does no harm, but you are wasting some space.\n"; } $primary = _get_designated_primary($res); ldie "Sorry, joining is only possible if a designated primary exists.\n" if $primary eq "(none)"; ldie "implausible state: I ($host) am already designated primary of resource '$res' which I just wanted to join\n" if $primary eq $host; ldie "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless link_exists("$mars/ips/ip-$host"); check_sync_startable(@_); my $replay = get_link("$resdir/replay-$primary"); if ($replay =~ m/^log-([0-9]+)-/) { $replay_nr = $1; $replay_nr =~ s/^0+//; } else { lwarn "'$res' cannot determine current primary '$primary' logfile number from '$replay'.\n"; if ($max_primary_retry-- >= 0) { wait_cluster($cmd, $res); goto primary_retry; } ldie "Primary '$primary' info is missing for '$res'.\n"; } } # purge before complaining log_purge_res($cmd, $res); err_purge_res($cmd, $res); finish_links(); _wait_delete(); # check for remains from former incidents my $max_retry = 3; while ($max_retry-- > 0) { my @remains = (); foreach my $remain (lamport_glob("$resdir/{log,version}-*-$host")) { if (!$create) { $remain =~ m/-([0-9]+)-$host/; my $nr = $1; next if $nr < $replay_nr; } lwarn "OLD REMAIN '$remain'\n"; push @remains, $remain; } last if !@remains; lwarn "Resource directory has some old remains.\n"; if ($force && !$create) { foreach my $remain (@remains) { lwarn "REMOVING remain '$remain'"; _create_delete($remain); } finish_links(); _wait_delete(); next; } lwarn "First cleanup.\n"; my $extra_op = $create ? "delete-resource" : "log-purge-all"; lwarn "If you really want to $cmd, run 'marsadm $extra_op' first.\n"; ldie "REFUSING $cmd\n" unless $force; } my $file = "$resdir/data-$host"; if (!$dev) { lwarn "link '$file' already exists - reusing\n" if link_exists($file); lprint "setup sparse file '$file' with size $size\n"; open(OUT, ">>", $file) or ldie "could not open '$file'\n"; truncate(OUT, $size) or ldie "truncate to size $size failed\n"; close OUT; } else { lprint "using existing device '$dev'\n"; set_link($dev, $file); _push_link($primary, $dev, $file) if $primary; } my $max_join_retry = 5; join_retry: if ($appear) { lprint "resource '$res' will appear as local device '/dev/mars/$appear'\n"; my $device_path = "$resdir/device-$host"; set_link($appear, $device_path); _push_link($primary, $appear, $device_path) if $primary; } mkdir("$mars/userspace") unless -d "$mars/userspace"; mkdir("$resdir/userspace") unless -d "$resdir/userspace"; mkdir("$resdir/defaults") unless -d "$resdir/defaults"; mkdir("$resdir/defaults-$host"); mkdir("$resdir/local-$host"); mkdir("$resdir/actual-$host"); set_link("0", "$resdir/actual-$host/if-on"); my $todo = "$resdir/todo-$host"; mkdir($todo); set_link("0", "$todo/attach"); set_link("1", "$todo/connect"); set_link("1", "$todo/sync"); set_link("1", "$todo/allow-replay"); unlink("$resdir/syncstatus-$host"); my $replay_ok = 1; if ($create) { set_link($host, "$resdir/primary"); set_link($host, "$resdir/userspace/last-primary"); set_link($size, "$resdir/syncstatus-$host"); my $startnr = get_link("$resdir/maxnr", 2); if (defined($startnr) && $startnr ne "" && $startnr > 0) { $startnr += 100000; } else { $startnr = 1; } my $fmt_old = sprintf("%09d", $startnr - 1); my $fmt = sprintf("%09d", $startnr); set_link("log-$fmt-$host,0,0", "$resdir/replay-$host"); my $logfile = "$resdir/log-$fmt-$host"; safe_creat($logfile) unless $dry_run; my $old_fake = "00000000000000000000000000000000,log-$fmt_old-$host,0"; set_link("$old_fake:", "$resdir/version-$fmt_old-$host"); set_link("00000000000000000000000000000000,log-$fmt-$host,0:$old_fake", "$resdir/version-$fmt-$host"); set_link("$startnr", "$resdir/skip-check-$host") if $startnr > 1; set_link("$startnr", "$resdir/maxnr"); finish_links(); lprint "successfully created resource '$res'\n"; } else { # join $replay_ok = _set_replaylink($cmd, $res, $replay_nr, $primary, ""); set_link("0", "$resdir/syncstatus-$host"); finish_links(); _trigger(3); if ($use_rsync && $ssh_port) { rsync_cmd($primary, "--max-size=1 --update $file $primary:$mars/resource-$res/", 1); } wait_cluster($cmd, $res, $primary); } set_link("1", "$todo/attach"); finish_links(); _trigger(3); _reset_resources(); if (!$create) { # Check for races with logrotate for (my $check_redo = 3; $check_redo > 0; $check_redo--) { wait_cluster($cmd, $res, $primary, 0); # Split brain or orphanness can happen due to _inherent_ races # with fresh appearance of the new replaylink. lprint "Checking '$res' for split brain\n"; %detected_splits = (); my $split_ok = detect_splitbrain($res, 1); lprint "redo=$check_redo resource '$res' split_ok=$split_ok replay_ok=$replay_ok\n"; if (!$split_ok || !$replay_ok) { lwarn "Race '$res' with log-rotate detected.\n"; if ($max_join_retry-- >= 0) { wait_cluster($cmd, $res, "*", 0); invalidate_res_phase1($cmd, $res); wait_cluster($cmd, $res, $host, 0); invalidate_res_phase2($cmd, $res); lwarn "Restarting '$res' due to unstable situation at primary '$primary'\n"; goto join_retry; } } } lprint "Successfully joined resource '$res' to primary '$primary'\n"; } systemd_trigger_extern($cmd, $res); } sub _fetch_transitive_peers { lprint "Computing the transitive closure of peers:\n"; my @peers = get_total_peers(); my @old_peers; my $probe_dir = "$mars/probe-$real_host"; system("rm -rf $probe_dir"); while (scalar(@peers) > scalar(@old_peers)) { lprint "Current #peers: " . scalar(@peers) . "\n"; @old_peers = @peers; foreach my $peer (@peers) { next if $peer eq $real_host; next if $peer =~ $match_reserved_id; _get_probe($probe_dir, $peer); } foreach my $peer (@peers) { next if $peer eq $real_host; next if $peer =~ $match_reserved_id; _wait_probe($probe_dir, $peer); } _reset_resources(); @peers = get_total_peers(); } lprint "Final #peers: " . scalar(@peers) . "\n"; } sub split_cluster { my ($cmd) = @_; llog "DEPRECATED: $cmd\n"; # start_deprecated_CODE foreach my $peer (get_total_peers()) { next unless $ssh_port; next if $peer eq $real_host; if (!ssh_cmd($peer, $ssh_probe, 1)) { lprint "Falling back to OLD ssh-based $cmd\n"; split_cluster_old(@_); return; } } lprint "Using NEW $cmd without ssh\n"; # end_deprecated_code my $probe_dir = _fetch_transitive_peers(); system("rm -rf $probe_dir"); lprint "IPS:\n"; my @peers = get_total_peers(); foreach my $peer (@peers) { my $ip = _get_ip($peer); lprint " Peer='$peer' IP='$ip'\n"; } lprint "PEER GROUPS:\n"; my $nr = 0; my %touched; my %peer_groups; foreach my $peer1 (@peers) { next if ($touched{$peer1}); my %members; $members{$peer1} = 1; foreach my $res (get_total_resources($peer1)) { lprint " peer '$peer1' res '$res'\n" if $verbose > 2; my @peers2 = get_total_peers($res); foreach my $peer2 (@peers2) { next if $members{$peer2}; lprint " peer '$res' res '$res' '$peer2'\n" if $verbose > 2; $touched{$peer2} = 1; $members{$peer2} = 1; } } $peer_groups{$peer1} = \%members; $nr++; lprint " #$nr: $peer1 => " . join(",", sort alphanum_cmp keys(%members)) . "\n"; } if ($nr <= 1) { lprint "Nothing to do: there are only $nr group(s)\n"; system("rm -rf $probe_dir"); return; } # PHASE 0: create emergency backup unless ($dry_run) { my $ips_backup = "$backup_dir/ips-backup"; lprint "Phase 0: create backup at $ips_backup\n"; system("mkdir -p $ips_backup") and ldie "Cannot create '$ips_backup'\n"; system("cp -a $mars/ips/* $ips_backup/"); } # PHASE 1: delete all lprint "Phase 1: delete all IPs\n"; foreach my $peer (@peers) { my $ip_path = "$mars/ips/ip-$peer"; _create_delete($ip_path); } finish_links(); _wait_delete(); _reset_resources(); foreach my $peer (@peers) { my $ip_path = "$mars/ips/ip-$peer"; foreach my $targ (@peers) { my $targ_ip = _get_ip($targ); _push_link_foreign($targ, $targ_ip, ".deleted", $ip_path); } } finish_links(); _reset_resources(); lprint "Wait for effect\n"; sleep(10); # PHASE 2: regenerate all groups lprint "Phase 2: re-create all groups\n"; foreach my $group (sort alphanum_cmp keys(%peer_groups)) { my $group_ip = _get_ip($group); lprint " GROUP '$group' ip '$group_ip'\n" if $verbose; my $list = $peer_groups{$group}; my %members = %$list; foreach my $peer (sort alphanum_cmp keys(%members)) { my $ip_path = "$mars/ips/ip-$peer"; my $ip = _get_ip($peer); lprint " PEER '$peer' '$ip'\n" if $verbose; foreach my $targ (sort alphanum_cmp keys(%members)) { my $targ_ip = _get_ip($targ); _push_link_foreign($targ, $targ_ip, $ip, $ip_path); } } } finish_links(); system("rm -rf $probe_dir"); _reset_resources(); } # deprecated, to disappear sub split_cluster_old { my ($cmd) = @_; ldie "ssh is disabled\n" unless $ssh_port; # checks lprint "Checking...\n"; my $ips_backup = "$backup_dir/ips-backup"; system("mkdir -p $ips_backup") and ldie "OOps"; system("cp -a $mars/ips/* $ips_backup/"); foreach my $peer (lamport_glob("$ips_backup/ip-*")) { $peer =~ s:^$ips_backup/ip-::; ssh_cmd($peer, $ssh_probe); } lprint "Creating IP backups...\n"; # create ips backup foreach my $peer (lamport_glob("$ips_backup/ip-*")) { $peer =~ s:^$ips_backup/ip-::; next if $peer eq $host; rsync_cmd($peer, "--max-size=1 --update $peer:$mars/ips/ $ips_backup/", 1); } system("cp -a $mars/ips/* $ips_backup/"); my @peers = map { $_ =~ s:^$ips_backup/ip-::; $_ } (lamport_glob("$ips_backup/ip-*")); ldie "Empty peer list\n" unless @peers; foreach my $peer (@peers) { next if $peer eq $host; rsync_cmd($peer, "--max-size=1 --update $ips_backup $peer:$backup_dir/"); } # Check symmetry lprint "Checking symmetry of resource memberships...\n"; my $data_cmd = "ls $mars/resource-*/data-*"; my $standard = qx:$data_cmd:; ldie "Cannot get local resource participants.\n" unless $standard; foreach my $peer (@peers) { next if $peer eq $host; my $ssh_cmd = make_ssh_cmd($peer) . " '$data_cmd'"; my $compare = qx:$ssh_cmd:; ldie "Asymmetric $mars/resource-*/data-* with peer $peer\n" unless $compare eq $standard; } # delete all hosts everywhere lprint "Deleting all peers IPs everywhere...\n"; foreach my $peer (@peers) { print "--- peer $peer\n"; my $cmd = "rm -rf $mars/ips"; ssh_cmd($peer, $cmd, 1); } # wait until all peer threads are gone lprint "Waiting...\n"; sleep(10); my $cond_cmd = "(( \$(ps ax | grep mars_peer | grep -v grep | wc -l) == 0 ))"; for (my $loop = 0; $loop < $timeout; $loop++) { my $running = 0; foreach my $peer (@peers) { my $status = ssh_cmd($peer, $cond_cmd, 1); $running++ if $status; } last if !$running; print "--- $running peers have running peer threads.\n"; sleep(1); } for (my $loop = 0; $loop < 3; $loop++) { # re-create all groups everywhere lprint "RE-CREATE peer IPs...\n"; foreach my $peer (@peers) { print "--- peer $peer\n"; my $cmd = "rm -f $mars/todo-global/delete* $mars/actual-*/msg-*connection-*; "; $cmd .= "shopt -s nullglob; "; $cmd .= "for i in $mars/resource-*; do if ! [[ -e \$i/data-$peer ]] && ! [[ -e \$i/replay-$peer ]]; then rm -rf $backup_dir/\${i##*/}; mv \$i $backup_dir/; fi; done; "; $cmd .= "mkdir -p $mars/ips; "; my $sub_list = "{ for dir in $mars/resource-*/data-$peer; do (cd \${dir%/*} && for i in data-*; do echo \$i; done); done; echo x-$peer; }"; my $sub_cmd = "echo RESTORE IP \$j; cp -a $ips_backup/ip-\$j $mars/ips/"; $cmd .= "for j in \$($sub_list | cut -d- -f2- | sort -u); do $sub_cmd; done"; lprint "$cmd\n"; ssh_cmd($peer, $cmd, 1); } } } sub leave_res_phase0 { my ($cmd, $res) = @_; check_not_primary($cmd, $res, 1); my $errors = 0; foreach my $tmp (lamport_glob("$mars/resource-$res/todo-$host/*")) { next if $tmp =~ m:/delete:; my $status = get_link($tmp, 2); if ($status) { lwarn "switch '$tmp' is not off\n"; $errors++; } } foreach my $tmp (lamport_glob("$mars/resource-$res/actual-$host/{is-,logfile-}*")) { my $status = get_link($tmp); if ($status) { lwarn "running status '$tmp' is not off\n"; $errors++; } } if (!$force) { check_status($cmd, $res, "is-attached", 0, 0, 1); ldie "there were $errors errors.\n" if $errors; } return 0; } sub leave_res_phase1 { my ($cmd, $res) = @_; set_link("1", "$mars/resource-$res/work-$host"); _create_delete("$mars/resource-$res/replay-$host"); _create_delete("$mars/resource-$res/data-$host"); _create_delete("$mars/resource-$res/syncstatus-$host"); my $syncpos = "$mars/resource-$res/syncpos-$host"; _create_delete($syncpos) if link_exists($syncpos); my $skip_check = "$mars/resource-$res/skip-check-$host"; _create_delete($skip_check) if link_exists($skip_check); my $vstatus = "$mars/resource-$res/verifystatus-$host"; _create_delete($vstatus) if link_exists($vstatus); _create_delete("$mars/resource-$res/device-$host"); _create_delete("$mars/resource-$res/actsize-$host"); foreach my $dir (lamport_glob("$mars/resource-$res/*-$host/")) { foreach my $tmp (lamport_glob("${dir}*")) { _create_delete($tmp); } _create_delete($dir); } finish_links(); return 0; } # wait for deletions (avoid races with following commands) sub leave_res_phase2 { my ($cmd, $res) = @_; _wait_delete(); _reset_resources(); $force = 0; # this would be too dangerous log_purge_res($cmd, $res); err_purge_res($cmd, $res); return 0; } sub leave_res_phase3 { my ($cmd, $res) = @_; _wait_delete(); _reset_resources(); err_purge_res($cmd, $res); # deprecated: hindering for new deletion method if ($compat_deletions && $host eq $real_host) { system("rm -f $mars/resource-$res/log-*"); } return 0; } sub leave_res_phase4 { my ($cmd, $res) = @_; # no longer generate this resource systemd_trigger_extern($cmd); return 0; } sub delete_res_phase1 { my ($cmd, $res) = @_; my $basedir = "$mars/resource-$res"; # preconditions if (! -d $basedir) { lprint "resource directory '$basedir' does no longer exist.\n"; return 0; } my @host_list = get_total_peers($res); my $cnt = scalar(@host_list); if ($cnt > 0) { my $h_list = join(',', @host_list); ldie "resource '$res' is not empty: first remove the hosts '$h_list' via leave-resource\n" unless $force; lwarn "BRUTE FORCE resource destruction: '$res' has $cnt members ($h_list) THESE ARE FINALLY TRASHED right now -- you are RESPONSIBLE for any subsequent problems.\n"; } for my $path (`find $basedir/* | sort -r`) { chomp $path; next if $path =~ m:/(maxnr$|\.deleted-):; _create_delete($path); } set_link("1", "$mars/resource-$res/work-$host"); finish_links(); _wait_delete(); _reset_resources(); return 0; } sub delete_res_phase2 { my ($cmd, $res) = @_; systemd_trigger_extern($cmd, $res); return 0; } sub logrotate_res { my ($cmd, $res) = @_; # report any upgrades / downgrades set_link($marsadm_version, "$mars/actual-$host/marsadm-version"); if ($force) { lwarn "operation $cmd --force is DANGEROUS\n"; } else { my $no_warn = ($cmd =~ /cron/); return if check_primary($cmd, $res, 0, 1, $no_warn); } my $log_glob = "$mars/resource-$res/log-*-$host"; lprint "get logfiles '$log_glob'\n"; my @paths = lamport_glob($log_glob); my $last; if (@paths) { @paths = sort alphanum_cmp @paths; lprint "found " . scalar(@paths) . " logfiles.\n"; $last = pop(@paths); } else { # By definition, logrotate an old logfile must have been produced # by the _same_ host. Only relevant for dangerous cron --force. lprint "nothing to $cmd: no old logfiles for '$host' exist.\n"; return; } if (-z $last) { lprint "an empty logfile '$last' already exists, nothing to do.\n" if $verbose; return; } # prevent too many small logfiles when secondaries are not catching up if (scalar(@paths) >= $thresh_logfiles) { my @stat = stat($last); my $size = 0; if (@stat) { $size = $stat[7] / (1000 * 1000 * 1000); } if ($size < $thresh_logsize) { lprint "current logfile '$last' is smaller than $thresh_logsize GB, skipping logrotate.\n" if $verbose; return; } } my $nr = $last; $nr =~ s/^.*log-([0-9]+)-.+$/$1/; my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1); ldie "logfile '$next' already exists\n" if file_exists($next); safe_creat($next) unless $dry_run; my $startnr = get_link("$mars/resource-$res/maxnr", 1); $startnr = $nr + 1 if ($nr >= $startnr); set_link("$startnr", "$mars/resource-$res/maxnr"); } sub _get_deletable_logfiles { my ($cmd, $res) = @_; my $min = -1; my $max = -1; my %replay_links; my $max_restarts = 3; # Race prevention: get the replay links first, afterwards the logfiles restart: my @replay_paths = lamport_glob("$mars/resource-$res/replay-*") or ldie "cannot find any replay symlinks\n"; foreach my $path (@replay_paths) { my $target = get_link($path, 1); if (!$target) { ldie "cannot get link '$path'\n" if $max_restarts-- < 0; goto restart; } $replay_links{$path} = $target; } my @log_paths = lamport_glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n"; foreach my $path (@log_paths) { $path =~ m/\/log-([0-9]+)-/; my $nr = $1; $min = $nr if ($nr < $min || $min < 0); $max = $nr if ($nr > $max || $max < 0); } foreach my $path (@replay_paths) { my $target = $replay_links{$path}; $target =~ m/^log-([0-9]+)/; my $nr = $1; $max = $nr if ($nr < $max || $max < 0); } return ($min, $max); } # for old deletion method (to disappear) my %delete_nrs; sub _get_delete_nr { my ($basedir, $total) = @_; return @{$delete_nrs{$basedir}} if defined($delete_nrs{$basedir}); _scan_caches() unless %total_peers; my $min_nr = 0; my $max_nr = 0; foreach my $path (lamport_glob("$basedir/deleted-*")) { $path =~ m:/deleted-(.+):; my $peer = $1; next unless $peer; next unless $total || $peer eq $real_host || $any_peers{$peer}; my $link = get_link($path, 1); $link =~ m/0*([0-9]+)/; $min_nr = $1 if (defined($1) && ($1 < $min_nr || !$min_nr)); $max_nr = $1 if (defined($1) && $1 > $max_nr); } my @deletes = lamport_glob("$basedir/delete-*"); foreach my $path (@deletes) { $path =~ m:/delete-0*([0-9]+)-(.+):; my $nr = $1; my $peer = $2; next unless $peer; next unless $total || $peer eq $real_host || $any_peers{$peer}; $max_nr = $nr if (defined($nr) && $nr > $max_nr); } my $nr_links = $max_nr - $min_nr; if ($nr_links > $max_deletions / 2) { $verbose = 1 if $verbose <= 0; lwarn "Too many deletion links have accumulated into directory $basedir/.\n"; lwarn "Probably your networking / your firewall rules / another setup problem is causing this, and your monitoring does not notice it.\n"; lwarn "Please fix it.\n"; lwarn "When necessary, remove $basedir/delete-* link by hand.\n"; # There might be holes in the sequence numbers of delete-* links. # Don't account for them when it comes to aborts. if (scalar(@deletes) > $max_deletions) { lwarn "URGENT: please fix it.\n"; ldie "Aborting for safety reasons\n" if $cron_mode && !$force; } } $delete_nrs{$basedir} = [$min_nr, $max_nr]; return @{$delete_nrs{$basedir}}; } sub _set_delete_nr { my ($basedir, $min_nr, $max_nr) = @_; $delete_nrs{$basedir} = [$min_nr, $max_nr]; } my %del_dirs; my %del_targets; sub _replace_delete { my ($basedir, $target) = @_; if (!$del_dirs{$basedir}) { foreach my $link (lamport_glob("$basedir/delete-*")) { my $targ = readlink($link); next unless $targ; $del_targets{$targ} = $link; } $del_dirs{$basedir} = 1; } my ($min_nr, $delete_nr) = _get_delete_nr($basedir, 1); if (defined($del_targets{$target})) { my $old = $del_targets{$target}; $old =~ m:/delete-([0-9]+)-:; my $old_nr = $1; return $old if $old_nr >= $min_nr; } my $new = sprintf("$basedir/delete-%09d-$real_host", ++$delete_nr); _set_delete_nr($basedir, $min_nr, $delete_nr); return $new; } my $nr_deletions = 0; sub _create_delete { my ($target) = @_; ldie "cannot delete: '$target' is no absolute path\n" unless $target =~ m:^/:; if (!$compat_deletions) { lprint "delete '$target'\n" if $verbose; set_link(".deleted", $target); return; } my $global_path = "$mars/todo-global"; my $new = _replace_delete($global_path, $target); lprint "create symlink $new -> $target\n" if $verbose; set_link($target, $new); $nr_deletions++; } sub _wait_delete { return if $dry_run; lwarn "Do not run this in --parallel mode\n" if $child_prefix; finish_links(); return if !$nr_deletions; $nr_deletions = 0; for (;;) { my $del_link = "$mars/todo-global/deleted-$real_host"; my $deleted = get_link($del_link, 1); if (!$deleted) { return if !$compat_deletions; lwarn "Old deletions: '$del_link' does not exist\n"; sleep_timeout(); return; } $deleted =~ s/^0+//; my ($min_nr, $delete_nr) = _get_delete_nr("$mars/todo-global"); last if $deleted >= $delete_nr; lprint "waiting for deletions to apply locally....\n"; sleep_timeout(); } } sub delete_file_cmd { my $cmd = shift; my $res = shift; # ignore this foreach my $path (@_) { check_userspace($path); _create_delete($path); } } sub _purge_dot_deleted { my ($clean_glob, $min_time) = @_; my $nr_unlink = 0; foreach my $path (raw_glob($clean_glob)) { next unless -l $path; my $val = readlink($path); next unless $val eq ".deleted"; my $age = get_link_stamp($path); next unless $age; if ($age < $min_time) { $nr_unlink++; if ($dry_run) { lprint "DRY_RUN: would unlink '$path' (currently '$val')\n"; next; } lprint " remove deleted '$path'\n" if $verbose; unlink($path); } } if ($nr_unlink) { _reset_resources(); } } sub _get_min_time { my ($start_time, $res) = @_; my $min_time = $start_time; if (!$force) { my %status = get_alive_links($res, "time", "*"); foreach my $peer (keys(%status)) { my $stamp = $status{$peer}; $stamp = 0 if (!defined($stamp) || $stamp eq ""); $min_time = $stamp if $stamp < $min_time; } # protect against dead / decommissioned peers my $back_time = $start_time - 3600 * $keep_backup_hours; $min_time = $back_time if $back_time > $min_time; } $min_time -= $window; return $min_time; } # old deletion method, hopefully to disappear somewhen in future sub _compat_purge_deleted { my ($del_glob, $msg_glob) = @_; my $start_time = mars_time(); foreach my $leftlink (lamport_glob($del_glob)) { # remove outdated .deleted-* markers if ($leftlink =~ m:/.deleted-:) { my $info = get_link($leftlink, 2); if ($info =~ m/^[0-9]+,[0-9]+$/) { my ($serial, $mode) = split(/,/, $info); my $todo_glob = "$mars/todo-global/deleted-*"; if ($mode) { $leftlink =~ m:/resource-([^/]+)/:; my $res = $1; $todo_glob = "$mars/resource-$res/todo-$real_host/deleted-*" } # compute the minimum border my $border = 0; foreach my $deleted_link (lamport_glob($todo_glob)) { my $limit = get_link($deleted_link, 2); next if !$limit || $limit <= 1; $border = $limit if (!$border || $limit < $border); } if ($serial < $border && $serial > 1) { unlink($leftlink); next; } } } my $stamp = get_link_stamp($leftlink); next unless $stamp + 3600 * 24 < $start_time; lprint " unlink '$leftlink'\n" if $verbose; unlink($leftlink); } foreach my $leftlink (lamport_glob($msg_glob)) { # cleanup historic msg-* if ($leftlink =~ m:/msg-(conncetion-from-|additional-connection-):) { lprint " unlink deprecated '$leftlink'\n" if $verbose; unlink($leftlink); next; } my $stamp = get_link_stamp($leftlink); next unless $stamp + 3600 * 4 < $start_time; lprint " unlink '$leftlink'\n" if $verbose; unlink($leftlink); } } my $purged_globally = 0; sub link_purge_global { my ($cmd) = @_; return if $purged_globally++; my $start_time = mars_time(); my $min_time = _get_min_time($start_time, "*"); # new deletion method my $clean_glob = "$mars/{,*/}{.*,*}"; _purge_dot_deleted($clean_glob, $min_time); # SAFEGUARD, careful: /mars/ips/ does not depend on resources. # It belongs to cluster level, not to resource level. # However, even at cluster level, there are subtle distinctions: # I cannot assume that removal of a peer is a "temporary" removal. # At least, the current version of MARS does not support a "temporary" # removal, because this would induce a _plethora_ of further changes to # many reports / commands / interfaces / etc etc. # Thus we _cannot_ use the _get_min_time() protection against dead / decommissioned # peers here, UNFORTUNATLY :( # Reason: this protection can only protect at more fine-grained layers, but it # cannot protect the _base_ of all of this. # Example: if you destroy the _foundation_ of a building, you have agreed to # destruction of the _whole_ building. # For _some_ heuristic safety against _accidental_ destruction of the foundations # of the Distributed System: solely rely on $keep_backup_hours here, # but I cannot rely on non-brokenness == aliveness of connections (otherwise the # logically destroyed symlinks would NEVER disappear fully). my $safe_clean_glob = "/mars/ips/ip-*"; my $safe_purge_time = $start_time - 3600 * $keep_backup_hours; _purge_dot_deleted($safe_clean_glob, $safe_purge_time); # old deletion method # DEPRECATED, to DISAPPEAR (hopefully in a few years) my $del_glob = "$mars/{,*/}{.tmp,.deleted,delete,work}-*"; my $msg_glob = "$mars/actual-*/msg-*"; _compat_purge_deleted($del_glob, $msg_glob); # keep internal backups for at least 1 hour if ($keep_backup_hours < 1) { $keep_backup_hours = 1; } lprint "removing left-over symlinks...\n" if $verbose; # remove any old alivelinks get_global_versions(); my $kernel_compat = get_alive_link("compat-alivelinks", $host, 2); # only when a new kernel is running if (defined($kernel_compat) && $kernel_compat ne "") { my $keep_hours = 0; my $glob = "{alive,buildtag,emergency,features,rest-space,time,tree,usable,used,compat-alivelinks}"; # Notice: $kernel_compat might differ from $compat_alivelinks # For example, this can happen temporarily during join-cluster & co if (!$kernel_compat) { lprint "NEW alivelinks active: purging OLD alivelinks\n" if $verbose; $glob = "$mars/$glob-*"; } elsif ($compat_alivelinks && $kernel_strategy_version < 3) { # This should happen only after true downgrades. # Notice that newer kernels are writing _both_ variants # during compat mode. lprint "OLD alivelinks active: purging NEW alivelinks\n" if $verbose; $glob = "$mars/actual-$host/$glob*"; $keep_hours = $keep_backup_hours unless $force; } else { $glob = ""; } if ($glob) { foreach my $leftlink (lamport_glob($glob)) { if ($keep_hours) { my $stamp = get_link_stamp($leftlink); next unless $stamp + 3600 * $keep_hours < $start_time; } _create_delete($leftlink); } } } foreach my $leftlink (lamport_glob("$mars/backup*")) { my $stamp = get_link_stamp($leftlink); next unless $stamp + 3600 * $keep_backup_hours < $start_time; lprint " unlink '$leftlink'\n" if $verbose; system("rm -rf $leftlink"); } # remove any left-over probe dirs my $probe_glob = "$mars/probe-*"; foreach my $path (raw_glob($probe_glob)) { my $age = get_link_stamp($path); next unless ($age < $min_time); system("rm -rf $path"); } # remove any left-over systemd generator temporaries my $script_dir = "$etc_marsadm/$generated_scripts_subdir"; foreach my $dir ($systemd_target_dir, $script_dir) { my $glob = "$dir/.{pre,tmp}.*.tmp"; foreach my $leftlink (raw_glob($glob)) { my $mtime = get_link_stamp($leftlink); if ($mtime) { system("touch -h -d \@0 \"$leftlink\""); next; } unlink($leftlink); } } } sub link_purge_res { my ($cmd, $res) = @_; link_purge_global($cmd); my $start_time = mars_time(); my $min_time = _get_min_time($start_time, $res); # new deletion method my $clean_glob = "$mars/resource-$res/{,*/}{.*,*}"; _purge_dot_deleted($clean_glob, $min_time); # old deletion method my $del_glob = "$mars/resource-$res/{,*/}{.tmp,.deleted,delete,work}-*"; my $msg_glob = "$mars/resource-$res/actual-*/msg-*"; _compat_purge_deleted($del_glob, $msg_glob); } sub logdelete_res { my ($cmd, $res) = @_; lprint "removing left-over $res logfiles...\n" if $verbose; my $start_time = mars_time(); my @paths = lamport_glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n"; @paths = sort alphanum_cmp @paths; my ($min_deletable, $max_deletable) = _get_deletable_logfiles(@_); lprint "min deletable logfile number: $min_deletable\n" if $verbose; lprint "min non-deletable logfile number: $max_deletable\n" if $verbose; if ($min_deletable >= $max_deletable) { lprint "no logfiles are deletable.\n" if $verbose; return; } if ($cmd eq "log-delete-one") { $max_deletable = $min_deletable + 1; # delete only the first one } my $nr = 0; my $first = shift(@paths); for (;;) { last unless $first; my $next = shift(@paths); # never delete the very last logfile last unless $next; # safeguard: only delete logfiles having a minium age last if !$force && is_link_recent($first); $nr = $first; $nr =~ s/^.*log-([0-9]+)-.+$/$1/; next unless $nr < $max_deletable; lprint "chosen '$first' for deletion\n" if $verbose; _create_delete($first); $first = $next; } # Determine whether a parallel join-resource is ongoing my $transient_join = 0; foreach my $path (lamport_glob("$mars/resource-$res/device-*")) { my $val = get_link($path, 1); next unless $val; my $stamp = get_link_stamp($path); if ($val eq "(none)") { lwarn "Transient join-resource detected at '$path'\n"; $transient_join = 1; # Remove any transient links after a while if ($stamp < 10 ) { finish_links(); set_link($val, $path); finish_links($stamp + 1); } elsif ($stamp < 20) { _create_delete($path); } next; } next if is_member($res, $host); next unless is_guest($res, $host); # remove any inactive guests after backup retention period my $device_on_path = "$mars/resource-$res/actual-$host/if-on"; my $device_on_val = get_link($device_on_path, 1); my $device_on_stamp = get_link_stamp($device_on_path); $device_on_stamp = $stamp if $stamp > $device_on_stamp; # is the guest inactive? if (defined($device_on_val) && $device_on_val ne "" && !$device_on_val && $device_on_stamp > 20 && $device_on_stamp + $keep_backup_hours < $start_time) { lprint "Purging guest '$res'\n"; #_create_delete($path); } } my @versionlinks = lamport_glob("$mars/resource-$res/version-*"); # When join-resource is humming, or during split-brain, # don't remove versionlinks as far as possible. if (scalar(@versionlinks) < $max_deletions / 8 && ($transient_join || !detect_splitbrain($res, 1))) { lwarn "Keeping some versionlinks\n"; return unless $force; } lprint "Removing left-over versionlinks...\n" if $verbose; foreach my $versionlink (@versionlinks) { my $nrv = $versionlink; $nrv =~ s/^.*\/version-([0-9]+)-.+$/$1/; # we need at least one more version link than logfiles for consistency checks next unless $nrv < $max_deletable - 1; _create_delete($versionlink); } # remove outdated split-cluster remains my %peers; foreach my $path (lamport_glob("$mars/ips/ip-*")) { $path =~ m:/ip-(.*):; my $peer = $1; $peers{$peer} = 1; } foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}todo-*/deleted-*")) { $leftlink =~ m:/deleted-(.*):; my $peer = $1; next if $peers{$peer}; my $stamp = get_link_stamp($leftlink); next unless $stamp + 3600 < $start_time; lprint " unlink '$leftlink'\n" if $verbose; unlink($leftlink); } } sub cron_phase1 { my ($cmd, $res) = @_; $cron_mode = 1; link_purge_res(@_); logrotate_res(@_); finish_links(); _trigger(3); return 0; } sub cron_phase2 { my ($cmd, $res) = @_; $cron_mode = 1; logdelete_res(@_); return 0; } my %protected_peers; sub get_protected_peers { my $ips_dir = "$mars/ips"; if (!scalar(keys(%protected_peers))) { foreach my $path (raw_glob("$ips_dir/ip-*")) { $path =~ m:/ip-(.*):; my $peer = $1; # do not protect any ill-formed peer names, by definition they need to be repaired. if (!$peer || $peer =~ $match_reserved_id) { lwarn "skipping invalid / illformed pathname '$path'\n"; next; } $protected_peers{$peer} = 1; } if (!scalar(keys(%protected_peers))) { lwarn "VERY DANGEROUS: the directory '$ips_dir' is EMPTY!!!"; } foreach my $peer (keys(%protected_peers)) { lprint "REAL PEER '$peer'\n"; } } if (!scalar(keys(%protected_peers))) { ldie "Cannot determine protected hostnames / peernames in $ips_dir - ABORTING for safety!"; } else { # ADDITIONAL SAFETY: don't forget any further information sources. if (!$protected_peers{$real_host}) { lwarn "Hopefully, the _reported_ real hostname '$real_host' (as reported by 'uname -a', NOT to be confused with a virtual hostname on the commandline via --host=\$other_name parameter) is REALLY correct.\n"; lwarn "DOUBLE-CHECK or TRIPLE-CHECK whether your configured hostname as reported by 'uname -a' is REALLY CORRECT.\n"; lwarn "Like anywhere else (not limited to MARS), ANY ERROR in the hostname configuration can easily create a DISASTER, up to DATA LOSS.\n"; ldie "POTENTIAL DISASTER: reported real host '$real_host' is missing in $ips_dir - ABORTING FOR SAFETY\n"; } if (!$protected_peers{$host}) { lwarn "DANGEROUS: host '$host' was missing in $ips_dir - CHECK BY HAND whether this is correct.\n"; $protected_peers{$host} = 1; lprint "ADD PEER '$host'\n"; } } # Finally, add _potentially_ _relevant_ peernames (not to be confused with _reported_ peernames). my @relevant_resources = get_member_resources($real_host); foreach my $relevant_res (@relevant_resources) { foreach my $path (raw_glob("$mars/resource-$relevant_res/replay-*")) { $path =~ m:/replay-(.*):; my $peer = $1; unless ($protected_peers{$peer}) { lwarn "PLEASE CHECK whether peer '$peer' or resource '$relevant_res' is SOMEHOW misconfigured.\n"; # CHECK: should this be added? # Some people might forget stray replaylinks in alive resources. # If I add this, some stray data might be kept forever, and sysadmins might be mis-informed # about its severity. # By _not_ activating this at the moment, the sysadmins will be responsible for checking # where the real problem is. # I cannot know this, for example when stone-aged hosts have been physically decommissioned forever. # OTOH the --autoclean option is expected to clean up anything which is "stray" and may be # "irritating". if (0) { $protected_peers{$peer} = 1; lprint "ADD PEER '$peer'\n"; } } } } } sub _autoclean_dir { my ($dir, $limit_stamp, $clean_full, $level) = @_; return if $level <= 0; get_protected_peers(); my $dir_stamp = get_stamp($dir); foreach my $path (raw_glob("$dir/*")) { # Some _global_ non-host-specific elements need to be protected _always_. # These can only deleted by filesystem destruction. next if $path =~ m:/(uuid|userspace|defaults|todo-global|ips)$:; if ($clean_full) { next if $path =~ m:/resource-:; next if $path =~ m:/lost\+found:; } else { # Keep protected elements. # Some non-host-specific elements need to be protected _always_. next if $path =~ m:/(primary|size|)$:; next if $path =~ m:/(log-|systemd):; # host-specific protections. my $found = 0; foreach my $peer (keys(%protected_peers)) { $found++ if $path =~ m:-$peer\Z:; } next if $found; } my $stamp = get_link_stamp($path); if (-d $path) { _autoclean_dir($path, $limit_stamp, $clean_full, $level - 1); if ($clean_full) { lprint "AUTOCLEAN RMDIR '$path'\n"; unless ($dry_run) { my $status = rmdir($path); my $txt = $!; if (!$status) { # do not report any backup dirs, they may have a very long lifetime if ($path !~ m:backup:) { lwarn "rmdir('$path') failed with status $status ($txt)\n"; } $dir_stamp = 0; } } } } elsif ($clean_full || -l $path) { if ($stamp > 0 && $stamp < $limit_stamp) { lprint "AUTOCLEANING '$path'\n"; unless ($dry_run) { my $status = unlink($path); if (!$status) { my $txt = $!; lwarn "unlink('$path') failed with status $status ($txt)\n"; $dir_stamp = 0; } } } } } if ($dir_stamp) { # Workaround a classical UNIX behaviour: # Reset the old mtime after successful autoclean. # Otherwise it could take another month until rmdir() can be successful. lprint "Resetting dir '$dir' to old timestamp '$dir_stamp'\n" if $verbose > 1; my $status = utime($dir_stamp, $dir_stamp, $dir); my $err = $!; if (1) { my $new_dir_stamp = get_stamp($dir); if ($new_dir_stamp != $dir_stamp) { lwarn "FAILED '$dir' '$dir_stamp' => '$new_dir_stamp' ($status '$err')\n"; } } } } sub autoclean_res { my ($cmd, $res, $after_days) = @_; if ($after_days <= 0) { lwarn "cannot $cmd: age $after_days days is lower than 1 day\n"; return; } my $resdir = "$mars/resource-$res"; return unless -d $resdir; lprint "autocleaning $res remains after $after_days days...\n" if $verbose; my $start_time = mars_time(); my $limit_stamp = $start_time - $after_days * 3600 * 24; get_protected_peers(); _autoclean_dir($resdir, $limit_stamp, 0, 3); lprint "autoclean $res done.\n" if $verbose; } sub cron_phase3 { my ($cmd, $res) = @_; return 0 unless $cron_autoclean_days > 0; _reset_resources(); autoclean_res($cmd, $res, $cron_autoclean_days); return 0; } sub autoclean_any { my ($cmd, $res, $after_days) = @_; if ($after_days <= 0) { lwarn "cannot $cmd: age $after_days days is lower than 1 day\n"; return; } # hard coded: at least 1 month ;) my $min_after_days = 30; $after_days = $min_after_days if $after_days = $min_after_days; my $start_time = mars_time(); my $limit_stamp = $start_time - $after_days * 3600 * 24; get_protected_peers(); foreach my $resdir (raw_glob("$mars/resource-*")) { my $protect_this = 0; my $protected_peer_list = ""; foreach my $peer (keys(%protected_peers)) { my $is_protected = scalar(raw_glob("$resdir/*-$peer")); if ($is_protected) { $protect_this = 1; $protected_peer_list .= "," if $protected_peer_list; $protected_peer_list .= $peer; } } if ($protect_this) { lprint "SKIPPING '$resdir' due to peers '$protected_peer_list'\n" if $verbose; next; } # Also check that everything is stone-aged ;) my $newest_stamp = 0; my $newest_element = ""; foreach my $path (raw_glob("$resdir/*")) { my $age = get_stamp($path); if ($age >= $limit_stamp) { $protect_this = 1; if ($age > $newest_stamp) { $newest_stamp = $age; $newest_element = $path; } } } if ($protect_this) { lprint "SKIPPING '$resdir' due to newest element '$newest_element' having age '$newest_stamp' (since the UNIX epoch)\n" if $verbose; next; } if ($force && !$dry_run) { lprint "FULLY AUTOCLEANING $resdir remains after $after_days days...\n"; system("rm -rf $resdir") unless $dry_run; } else { lprint "WOULD autoclean $resdir FULLY via rm -rf after $after_days days\n" if $verbose; } } # Also autoclean very old self-probe directories. # Why do I not clean them earlier? # They may function as a very old _emergency_ backup after some hardware went defective. # Only usable for _manual_ repair by _experts_. # In future, better usages might be possible. my $probe_dir = "$mars/probe-$real_host"; my $probe_dir_old = "$probe_dir.old"; if (-d $probe_dir_old) { system("rm -rf $probe_dir_old"); } elsif (-d $probe_dir) { my $age = get_stamp($probe_dir); if ($age < $limit_stamp) { rename($probe_dir, $probe_dir_old); } } # Also autoclean very old / stone-aged directories from ancient MARS versions. # Shame on you if you really would believe that you might need to downgrade to such obsolete versions. # They should not have been updated over the network anyway, and thus they cannot # contribute anything to current MARS. _autoclean_dir($mars, $limit_stamp, 1, 4); } sub cron_phase4 { my ($cmd, $res) = @_; return 0 unless $cron_autoclean_days > 0; lprint "======== EXTRA PHASE: GLOBAL AUTOCLEAN\n"; _reset_resources(); autoclean_any($cmd, "*", $cron_autoclean_days); _reset_resources(); lprint "GLOBAL AUTOCLEAN finished.\n"; return 0; } sub attach_res_phase0 { my ($cmd, $res) = @_; return 0 if $force; my $detach = ($cmd eq "detach"); if ($detach) { my $device_in_use = get_link("$mars/resource-$res/actual-$host/open-count", 1); if ($device_in_use) { my $want_path = "$mars/resource-$res/systemd-want"; my $want = get_link($want_path, 2); if ($want) { lprint "IMPORTANT: Relying on systemd for $cmd of resource '$res'\n"; my $path = "$mars/resource-$res/todo-$host/attach"; _switch($cmd, $res, $path, 0); finish_links(); systemd_any_trigger($cmd, $res); return 0; } my $dev = device_name($res); ldie "device '$dev' is in use\n"; } } return 0; } # only for systemd: wait that primary device is no longer open sub attach_res_phase0b { my ($cmd, $res) = @_; return 0 unless $cmd eq "detach"; return 0 unless systemd_present(@_); check_status($cmd, $res, "open-count", 0, 1); wait_cluster($cmd); return 0; } sub attach_res_phase1 { my ($cmd, $res) = @_; my $detach = ($cmd eq "detach"); my $path = "$mars/resource-$res/todo-$host/attach"; _switch($cmd, $res, $path, !$detach); return 0; } sub attach_res_phase2 { my ($cmd, $res) = @_; my $detach = ($cmd eq "detach"); return 0 if $force; if (!is_module_loaded()) { lwarn "Kernel module not loaded: $cmd will become effective after modprobe\n"; return 0; } check_status($cmd, $res, "is-attached", $detach ? 0 : 1, 1); if ($detach) { system("sync"); check_mars_device($cmd, $res, 1, 1); check_status($cmd, $res, "is-replaying", 0, 1); check_status($cmd, $res, "is-syncing", 0, 1); system("sync"); } return 0; } sub fetch_global_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/disconnect|pause/); my @paths = lamport_glob("$mars/resource-$res/todo-*/"); for my $path (@paths) { _switch($cmd, $res, "$path/connect", !$pause); } } sub fetch_local_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/disconnect|pause/); my $path = "$mars/resource-$res/todo-$host/connect"; _switch($cmd, $res, $path, !$pause); } sub pause_sync_global_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); check_sync_startable(@_) if !$pause; my @paths = lamport_glob("$mars/resource-$res/todo-*/"); for my $path (@paths) { _switch($cmd, $res, "$path/sync", !$pause); } } sub pause_sync_local_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); check_sync_startable(@_) if !$pause; my $path = "$mars/resource-$res/todo-$host/sync"; _switch($cmd, $res, $path, !$pause); } sub pause_replay_global_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); my @paths = lamport_glob("$mars/resource-$res/todo-*/"); for my $path (@paths) { _switch($cmd, $res, "$path/replay", !$pause); } } sub pause_replay_local_res { my ($cmd, $res) = @_; my $pause = ($cmd =~ m/pause/); my $path = "$mars/resource-$res/todo-$host/replay"; _switch($cmd, $res, $path, !$pause); } sub up_res_phase0 { my ($cmd, $res) = @_; my $down = ($cmd eq "down"); if ($down) { attach_res_phase0("detach", $res); } else { attach_res_phase0("attach", $res); } return 0; } sub up_res_phase1 { my ($cmd, $res) = @_; my $down = ($cmd eq "down"); if ($down) { pause_replay_local_res("pause-replay-local", $res); pause_sync_local_res("pause-sync-local", $res); fetch_local_res("pause-fetch", $res); attach_res_phase1("detach", $res); } else { attach_res_phase1("attach", $res); fetch_local_res("resume-fetch-local", $res); # ignore ldie on sync, just do all the rest eval { pause_sync_local_res("resume-sync-local", $res); }; pause_replay_local_res("resume-replay-local", $res); } return 0; } sub up_res_phase2 { my ($cmd, $res) = @_; my $down = ($cmd eq "down"); if ($down) { attach_res_phase2("detach", $res); } else { attach_res_phase2("attach", $res); } return 0; } sub set_replay_res { my ($cmd, $res, $new_nr) = @_; if (!$new_nr || $new_nr <= 0) { ldie "you must supply a numeric logfile number as third argument.\n"; } check_not_primary($cmd, $res); check_todo($cmd, $res, "replay", 0, 0); my $replaylink = "$mars/resource-$res/replay-$host"; my $old_val = get_link($replaylink); my $old_nr = $old_val; $old_nr =~ s/log-([0-9]+)-.*/$1/; ldie "old log number '$old_nr' is wrong\n" unless $old_nr > 0; if ($new_nr > $old_nr) { lwarn "you try to skip logfile numbers from $old_nr to $new_nr, are you sure?\n"; ldie "you would need --force if you really know what you are doing.\n" unless $force; } my $primary = _get_designated_primary($res); _set_replaylink($cmd, $res, $new_nr, $primary); } sub fake_sync_phase1 { my ($cmd, $res) = @_; my $path = "$mars/resource-$res/todo-$host/sync"; _switch($cmd, $res, $path, 0); return 0; } sub fake_sync_phase2 { my ($cmd, $res) = @_; check_status($cmd, $res, "is-syncing", 0, 1); return 0; } sub fake_sync_phase3 { my ($cmd, $res) = @_; my $size = get_link("$mars/resource-$res/size"); my $target = "$mars/resource-$res/syncstatus-$host"; set_link($size, $target); return 0; } sub _primary_res { my ($res, $new, $old) = @_; my $pri = "$mars/resource-$res/primary"; finish_links(); my $old_stamp = get_link_stamp($pri); set_link($new, $pri); if ($new eq "(none)") { # Allow overriding of secondaries in a partitioned cluster # by using a weaker timestamp. # When somebody issues a (more or less) _concurrent_ "primary" command # during a network partition, let the "primary" win over the "secondary". # Notice: the definition of "concurrent" in the sense of Lamport # (where _any_ communication may take "arbitrarily" long) # may be non-intuitive to humans in the presence of a network outage. finish_links($old_stamp + 1) if $old_stamp; } else { my $last_lnk = "$mars/resource-$res/userspace/last-primary"; my $prev_lnk = "$mars/resource-$res/userspace/prev-primary"; system("mv -f $last_lnk $prev_lnk"); set_link($new, $last_lnk); } lprint "designated primary changed from '$old' to '$new'\n"; } # check whether primary/secondary switching is possible at all sub primary_phase0 { my ($cmd, $res) = @_; ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars"); if ($force) { check_todo($cmd, $res, "fetch", 0, 0); } my $old = _get_designated_primary($res, -1); lprint "Current designated primary: $old\n"; if ($cmd eq "primary") { if ($host ne $old) { lprint "Allowing handover in cases of sync: ignore_sync=$ignore_sync\n" if $ignore_sync; check_sync_finished($res, $host, $ignore_sync); # also check that other secondaries won't loose their sync primary my @names = lamport_glob("$mars/resource-$res/data-*"); # for k <= 2 replicas, the previous check must have been sufficient if (scalar(@names) > 2) { my $allow_anyway = ($force || $ignore_sync); lprint "Allowing handover in cases of sync: force=$force ignore_sync=$ignore_sync\n" if $allow_anyway; foreach my $name (@names) { $name =~ m:/data-(.+):; my $peer = $1; next if ($peer eq $old || $peer eq $host); # Only abort when peer is known to be alive. # Otherwise any problem should spit warnings like hell. my $alive_stamp = get_alive_link("time", $peer); my $allow_this = (!defined($alive_stamp) || $alive_stamp eq "" || !is_recent($alive_stamp)); check_sync_finished($res, $peer, $allow_anyway || $allow_this); } } } check_todo($cmd, $res, "attach", 1, 0); check_todo($cmd, $res, "fetch", 1, 0) if !$force; check_todo($cmd, $res, "replay", 1, 0); # check that no logfile replay errors exist. my $replay_error = get_link("$mars/resource-$res/actual-$host/replay-code", 2); if (defined($replay_error) && $replay_error ne "" && int($replay_error) < 0) { lwarn "Logfile replay / recovery stopped with error code $replay_error.\n"; ldie "Won't switch to avoid unnoticed data loss. You may however do a 'primary --force'.\n" unless $force; } } my $want_path = "$mars/resource-$res/systemd-want"; my $want = get_link($want_path, 2); if ($want) { my $new; my $oper; if ($cmd eq "primary") { $new = $host; $oper = "start"; } else { $new = "(none)"; $oper = "stop"; } set_link($new, $want_path); my $unit_path = "$mars/resource-$res/systemd-$oper-unit"; my $unit = get_link($unit_path, 2); if ($unit) { if ($old ne "(none)") { report_systemd_status($cmd, $res, 0, "stop", $old); } lprint "IMPORTANT: Relying on systemd for $oper of unit '$unit'\n"; lprint "IMPORTANT: unit '$unit' wanted at '$new'\n"; finish_links(); _trigger(3); return 0; } } return 0 if ($old eq $host and $cmd eq "primary"); return 0 if $old eq "(none)"; my $open_count_path = "$mars/resource-$res/actual-$old/open-count"; my $device_in_use = get_link($open_count_path, 1); if ($device_in_use) { my $dev = device_name($res, $old); lwarn "device '$dev' for resource '$res' is $device_in_use times in use on primary host '$old'\n"; ldie "first you must umount/close the device (on host '$old')\n" unless $force; lwarn "First you SHOULD umount/close the device (on host '$old'), but you ignore this recommendation by giving the --force option.\n"; if (is_link_recent($open_count_path)) { lwarn "You are forcing a SPLIT BRAIN via --force right now. Do you know that this is an ERRONEOUS state? Do you really know what you are doing?\n"; } else { lwarn "You may produce a SPLIT BRAIN via --force because the peer host '$old' is currently not reachable.\n"; } } lprint "all preconditions OK for resource '$res'\n"; return 0; } # only for primary --force: wait until fetch has actually stopped sub primary_phase0a { my ($cmd, $res) = @_; my $new = $host; if (!$force && $cmd =~ m/primary/) { lprint "Prepare new primary '$new' handover\n"; _switch($cmd, $res, "$mars/resource-$res/todo-$new/fetch", 1); } finish_links(); return 0 unless $force; wait_cond($cmd, $res, "is-fetch-off"); return 0; } # only for systemd: wait that primary device is no longer open sub primary_phase0b { my ($cmd, $res) = @_; return 0 unless systemd_present(@_); # only relevant for true handover my $old = _get_designated_primary($res, -1); return 0 if ($old eq $host || $old eq "(none)"); # ignore primary --force my $connect_path = "$mars/resource-$res/todo-$host/connect"; my $connect = get_link($connect_path, 1); return 0 if (!defined($connect) || !$connect); # Notice: this is a workaround for a problem which is # outside of our scope. For example, a remote umount will # fail when any (delayed) process has any filehandle open. # As long as the umount was unsuccessful, we _cannot_ proceed # with planned handover. # As a workaround, we constantly trigger the remote systemd # in the _hope_ that the umount will succeed, and the # open-count will then go down to zero, hopefully somewhen. my $watch = "$mars/resource-$res/systemd-want"; my $action = ""; if (link_exists($watch)) { $action = "system(\"touch -h $watch\");"; my $response_path = "$mars/resource-$res/userspace/systemd-status-stop-$old"; $action .= "\$action_status = get_link(\"$response_path\");"; my $msg = "systemctl stop on peer $old: status=\$action_status\n"; $action .= "ldie \"$msg\" if \$action_status;"; } # try to compensate failures by systemd restart my $compensation = undef; if ($old ne "(none)") { $compensation = "lprint \"Restarting '$res' on '$old'\n\"; "; $compensation .= "_primary_res(\"$res\", \"$old\", \"(none)\"); "; $compensation .= "set_systemd_want_phase1(\"$cmd\", \"$res\", \"$old\"); "; $compensation .= "finish_links(); "; $compensation .= "_trigger(3); "; $compensation .= "lprint \"Triggered systemd at '$old'.\n\";"; } check_status($cmd, $res, "open-count", 0, 1, undef, undef, $old, $action, $compensation); return 0; } # when necessary, switch to secondary (intermediately) sub primary_phase1 { my ($cmd, $res) = @_; return 0 if ($force and $cmd eq "primary"); my $old = _get_designated_primary($res, -1); return 0 if ($old eq $host and $cmd eq "primary"); my $new = "(none)"; if (!$force and $cmd eq "primary") { my $status = try_to_avoid_splitbrain($cmd, $res, $old); return $status if $status; } return 0 if $old eq $new; _primary_res($res, $new, $old); return 0; } my $phase2_waited = 0; sub primary_phase1b { my ($cmd, $res) = @_; $phase2_waited = 0; finish_links(); return 0 if $force; my $old = _get_designated_primary($res, -1); my $status = check_primary_gone($cmd, $res, $old); return $status if $status; if (!$force and $cmd eq "primary") { my $status = try_to_avoid_splitbrain($cmd, $res, $old); return $status if $status; } return 0; } # when necessary, wait sub primary_phase2 { my ($cmd, $res) = @_; return 0 if $force; return 0 unless $cmd eq "primary"; wait_cluster($cmd) if !$phase2_waited++; my $old = _get_designated_primary($res, -1); return check_primary_gone($cmd, $res, $old); } sub primary_phase2b { my ($cmd, $res) = @_; return 0 if $force; if (systemd_present(@_)) { my $old = _get_designated_primary($res, -1); return try_to_avoid_splitbrain($cmd, $res, $old); } return 0; } # when necessary, switch to primary sub primary_phase3 { my ($cmd, $res) = @_; return 0 unless $cmd eq "primary"; my $old = _get_designated_primary($res, -1); my $new = $host; _primary_res($res, $new, $old); return 0; } sub primary_phase3b { finish_links(); return 0; } # wait for device to appear / disappear sub primary_phase4 { my ($cmd, $res) = @_; if($cmd eq "secondary") { check_mars_device($cmd, $res, 1, 1); return 0; } my $ok = detect_splitbrain($res, 1); if (!$ok) { lwarn "\n"; lwarn "Sorry, in split brain situations I can only set the _designated_\n"; lwarn "primary, but I cannot _guarantee_ that becoming the\n"; lwarn "_actual_ primary is always possible.\n"; lwarn "You SHOULD resolve the split brain ASAP (e.g. by leave-resource\n"; lwarn "or invalidate etc).\n"; lwarn "\n"; lwarn "If you already tried to resolve the split brain manually, but\n"; lwarn "this message does not disappear, the reason could be some\n"; lwarn "hindering left-overs/remains from the former split brain.\n"; lwarn "ONLY in such a case, try log-purge-all --force.\n"; lwarn "\n"; return 0; } check_mars_device($cmd, $res, 1, 0); # new switch semantics, when nothing has failed before: up up_res_phase1(@_); return 0; } sub primary_phase5 { return 0 unless systemd_present(@_); return set_systemd_want_phase2(@_); } sub wait_umount_res { my ($cmd, $res) = @_; my $path = "$mars/resource-$res/actual-$host/open-count"; while (1) { my $sum = get_link($path); last if !$sum; lprint "device for resource '$res' is $sum times in use on $host\n"; sleep_timeout(1); } lprint "OK, device for resource '$res' is not in use.\n"; } sub invalidate_res_phase0 { my ($cmd, $res) = @_; my $nr_retry = 0; my $status; retry: check_not_primary($cmd, $res); my $primary = _get_designated_primary($res); ldie "for operation '$cmd', some other designated primary must exist (currently there is none)\n" if $primary eq "(none)"; ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host; my $emergency_path = "$mars/resource-$res/actual-$primary/has-emergency"; my $emergency = get_link($emergency_path, 1); if ($emergency) { lwarn "Primary '$primary' is in emergency mode. Cannot invalidate '$res'.\n"; $nr_retry++; sleep($nr_retry); if ($nr_retry <= 5) { update_cluster($cmd, $res, $primary); goto retry; } lwarn "Primary '$primary' is in emergency mode. Cannot invalidate '$res'.\n"; } return 0; } sub invalidate_res_phase1 { my ($cmd, $res) = @_; _switch($cmd, $res, "$mars/resource-$res/todo-$host/attach", 0); _switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 0); _switch($cmd, $res, "$mars/resource-$res/todo-$host/fetch", 0); _switch($cmd, $res, "$mars/resource-$res/todo-$host/replay", 0); return 0; } sub invalidate_res_phase2 { my ($cmd, $res) = @_; if (!$force) { check_status($cmd, $res, "is-syncing", 0, 1); check_status($cmd, $res, "is-fetching", 0, 1); check_status($cmd, $res, "is-replaying", 0, 1); check_status($cmd, $res, "is-attached", 0, 1); } return 0; } sub invalidate_res_phase3 { my ($cmd, $res) = @_; my $dst = "$mars/resource-$res/syncstatus-$host"; my $primary = _get_designated_primary($res); ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)"); ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host; my $replay = get_link("$mars/resource-$res/replay-$primary"); $replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n"; my $replay_nr = $1; set_link("0", $dst); finish_links(); # opportunity for errors => don't continue for my $vers_path (lamport_glob("$mars/resource-$res/version-*-$host")) { $vers_path =~ m:/version-([0-9]+):; my $this_nr = $1; _create_delete($vers_path) if $this_nr >= $replay_nr; } _create_delete("$mars/resource-$res/replay-$host"); finish_links(); return 0; } sub invalidate_res_phase4 { my ($cmd, $res) = @_; _wait_delete(); return 0; } sub invalidate_res_phase5 { my ($cmd, $res) = @_; $force = 0; # this would be too dangerous log_purge_res(@_); err_purge_res(@_); return 0; } sub invalidate_res_phase6 { my ($cmd, $res) = @_; _wait_delete(); return 0; } sub invalidate_res_phase7 { my ($cmd, $res) = @_; my $dst = "$mars/resource-$res/syncstatus-$host"; my $primary = _get_designated_primary($res); ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)"); ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host; my $replay = get_link("$mars/resource-$res/replay-$primary"); $replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n"; my $replay_nr = $1; _set_replaylink($cmd, $res, $replay_nr, $primary, ""); finish_links(); return 0; } sub invalidate_res_phase8 { my ($cmd, $res) = @_; _wait_delete(); return 0; } sub invalidate_res_phase9 { my ($cmd, $res) = @_; my $dst = "$mars/resource-$res/syncstatus-$host"; my $primary = _get_designated_primary($res); ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)"); ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host; _switch($cmd, $res, "$mars/resource-$res/todo-$host/attach", 1); _switch($cmd, $res, "$mars/resource-$res/todo-$host/fetch", 1); _switch($cmd, $res, "$mars/resource-$res/todo-$host/replay", 1); finish_links(); _switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 1); finish_links(); return 0; } my %resize_device_size; my %resize_old_size; my %resize_new_size; sub get_possible_size { my ($cmd, $res) = @_; my @actsizes = lamport_glob("$mars/resource-$res/actsize-*"); lwarn "resource $res has no actsize-* symlinks\n" unless @actsizes; my $possible_size = 0; foreach my $actsize (@actsizes) { my $this_size = get_link($actsize); if (!$possible_size || $this_size < $possible_size) { $possible_size = $this_size; } } return $possible_size; } sub resize_phase0 { my ($cmd, $res, $size_arg) = @_; ldie "mars kernel module is not loaded. This is needed for communication with some other hosts!\n" if !is_module_loaded(); my $new_size = 0; if ($size_arg) { $new_size = get_size($size_arg); ldie "optional size argument '$new_size' must be numeric and positive\n" unless scalar($new_size) > 0; lprint "new size: $new_size bytes\n"; } check_primary($cmd, $res, 1); my $my_size = get_link("$mars/resource-$res/actsize-$host"); my $lnk = "$mars/resource-$res/size"; my $old_size = get_link($lnk); lprint "old_size=$old_size\n"; my $possible_size = get_possible_size($cmd, $res); lprint "possible_size=$possible_size\n"; $new_size = $possible_size if !$new_size; lprint "new_size=$new_size\n"; ldie "new size $new_size is higher than the possible size (minimum of all volumes) $possible_size" if $new_size > $possible_size; # no override with --force possible # disallow decreasing ldie "only increases of the size are possible!\n" if $new_size < $old_size; my $waste = $my_size - $new_size; lwarn "You are wasting $waste bytes locally\n" if $my_size > $new_size; # remember values $resize_device_size{$res} = _get_mars_size(@_); $resize_old_size{$res} = $old_size; lwarn "internal mismatch between actual device size and resource size: $resize_device_size{$res} != $resize_old_size{$res}\n" unless $resize_device_size{$res} == $resize_old_size{$res}; $resize_new_size{$res} = $new_size; return 0; } sub resize_phase1 { my ($cmd, $res) = @_; my $old_size = $resize_old_size{$res} or ldie "bad internal size value\n"; my $new_size = $resize_new_size{$res} or ldie "bad internal size value\n"; # for safety, check again ldie "only increases of the size are possible!\n" if $new_size < $old_size; check_primary($cmd, $res, 1); # Mark the primary data / its size as authoritative my $act_lnk = "$mars/resource-$res/syncstatus-$host"; set_link($new_size, $act_lnk); finish_links(); # Chance for errors to pop up # Now set the new resource size my $lnk = "$mars/resource-$res/size"; set_link($new_size, $lnk); finish_links(); return 0; } sub resize_phase2 { my ($cmd, $res) = @_; my $old_size = $resize_old_size{$res} or ldie "bad internal size value\n"; my $new_size = $resize_new_size{$res} or ldie "bad internal size value\n"; for (;;) { my $new_device_size = _get_mars_size(@_); if ($new_device_size == $resize_new_size{$res}) { lprint "Device size is now $new_device_size.\n"; last; } lprint "Device size $new_device_size has not yet reached the new size $resize_new_size{$res}.\n"; if ($new_device_size != $resize_device_size{$res}) { lwarn "The size has changed, but did not reach the correct value."; lwarn "Assuming some rounding problems (which may occur at some device types)\n"; last; } sleep_timeout(); } return 0; } # Remark: this is historic, and should not be used anymore. # However, it likely needs to be kept for DRBD similarity. sub role_cmd { my ($cmd, $res, $peer) = @_; $peer = $host unless (defined($peer) && $peer); my $is_primary = is_actual_primary($cmd, $res, $peer); my $todo_primary = _get_designated_primary($res); my $msg = "I am actually "; $msg .= $is_primary ? "primary" : "secondary"; if ($todo_primary ne $peer) { $msg .= " and $todo_primary should be primary"; } lprint $msg . "\n"; } # Deprecated. Check whether it can be removed. sub mars_state_cmd { my ($cmd, $res, $peer) = @_; $peer = $host unless (defined($peer) && $peer); my $is_primary = is_actual_primary($cmd, $res, $peer); my $todo_primary = _get_designated_primary($res); if ($is_primary) { lprint "is_primary\n"; return; } if ($todo_primary eq $peer) { lprint "becoming_primary\n"; return; } # secondary without ambitions to become primary my $size = get_link("$mars/resource-$res/size"); my $syncstatus = get_link("$mars/resource-$res/syncstatus-$peer"); if ($syncstatus != $size) { lprint "secondary inconsistent ($syncstatus bytes of $size)\n"; return; } if ($todo_primary eq "(none)") { lprint "secondary\n"; return; } my $primary_replay = get_link("$mars/resource-$res/replay-$todo_primary"); my $peer_replay = get_link("$mars/resource-$res/replay-$peer"); if ($primary_replay eq $peer_replay) { lprint "secondary uptodate\n"; return; } lprint "secondary outdated ($peer_replay instead of $primary_replay)\n"; } sub cat_cmd { my $cmd = shift; foreach my $path (@_) { _get_text($path, undef, 1); } } sub mars_info_cmd { my ($cmd, $res) = @_; my $info = "$mars/resource-$res/logstatus-$host.status"; cat_cmd($cmd, $info); } sub show_cmd { my ($cmd, $res) = @_; $res = "*" if !$res || $res eq "all"; $res = "{$res}" if $res =~ m/,/; my $glob = "$mars/{ips/ip-$host,alive-$host,emergency-$host,rest-space-$host,resource-$res/{device,primary,size,actsize-$host,syncstatus-$host,replay-$host,actual-$host/*,todo-$host/*}}"; foreach my $link (lamport_glob($glob)) { next unless link_exists($link); my $res = get_link($link); my $short = $link; $short =~ s:^$mars/::; lprint "$short=$res\n"; } } sub show_errors_cmd { my ($cmd, $res) = @_; my $text = get_error_text(@_); if ($text) { lprint $text; ldie "resource $res has some (old) problems.\n"; } else { lprint "no errors/warnings are reported.\n"; } } sub version { lprint "$0 Version: $marsadm_version $Id\n"; #lprint "my IP is $ip\n"; exit 0; } ################################################################## # pretty printing sub seconds2human { my $seconds = shift; return "--:--:--" unless (defined($seconds) && $seconds ne "" && $seconds >= 0); return sprintf("%02d:%02d:%02d", $seconds / 3600, ($seconds % 3600) / 60, $seconds % 60); } sub number2human { my ($number, $unit, $max) = @_; $max = $number if !defined($max); my $k = 1024.0; my $flip_over = 9.99; if (!defined($unit) || !$unit) { if ($max >= $k * $k * $k * $k * $flip_over) { $unit = "T"; } elsif ($max >= $k * $k * $k * $flip_over) { $unit = "G"; } elsif ($max >= $k * $k) { $unit = "M"; } elsif ($max >= $k) { $unit = "K"; } else { $unit = ""; } } my $i = "i"; if ($unit =~ m/^[a-z]/) { $i = ""; $k = 1000.0; } $_ = $unit; SWITCH: { if (/t/i) { $number /= $k * $k * $k * $k; $unit = "T${i}B"; last SWITCH; } if (/g/i) { $number /= $k * $k * $k; $unit = "G${i}B"; last SWITCH; } if (/m/i) { $number /= $k * $k; $unit = "M${i}B"; last SWITCH; } if (/k/i) { $number /= $k; $unit = "K${i}B"; last SWITCH; } $unit = "B"; } if ($unit eq "B") { $number = int($number + 0.1); } else { $number = sprintf("%.3f", $number); } return "$number $unit"; } sub progress_bar { my ($length, $min, $mid, $max) = @_; $min = 0 if $min < 0; $mid = $min if $mid < $min; $max = $mid if $max < $mid; $max = 1 if $max < 1; my $pos1 = $length * $min / $max; my $count = $pos1 - 1; $count = 0 if $count < 0; my $bar = '=' x $count; if ($pos1 < $length) { $bar .= ">"; $pos1++; } my $pos2 = $length * $mid / $max; if ($pos1 < $pos2) { $bar .= ':' x ($pos2 - $pos1); } if ($pos2 < $length) { $bar .= '.' x ($length - $pos2); } return "[$bar]"; } ################################################################## # macro evaluation sub make_numeric { my $number = shift; return 0 if (!defined($number) || $number eq ""); return $number; } sub set_args { my $outer_env = shift; my $inner_env = shift; $$inner_env{"callstack"} .= "," if $$inner_env{"callstack"}; $$inner_env{"callstack"} .= ${_[0]}; my $index = 0; while (defined(my $next = shift)) { my $arg = parse_macro($next, $outer_env); $$inner_env{$index++} = $arg; } # clear all other number variables to avoid confusion while (defined($$inner_env{$index})) { undef $$inner_env{$index++}; } } # evaluate a primitive macro sub eval_fn { my $env = shift; my $fn = shift; # optionally allow primitive macros without prefix primitive- to be substituted if ($fn !~ s/^primitive[-_]?//) { my $macro = get_macro($fn, 1); if ($macro) { set_args($env, $env, $fn, @_); return parse_macro($macro, $env); } } my $arg1 = shift; $_ = $fn; SWITCH: { if (/^$/) { # variable my $varname; # prefix *crement operators if ($arg1 =~ m/^([-+]{2})$/) { my $op = $arg1; my $arg1 = shift; $varname = parse_macro($arg1, $env); if (!defined($varname)) { lwarn "macro processor: prefix operator '$op': varname '$arg1' sub-expansion yields 'undef'\n"; $varname = ""; } elsif ($varname eq "") { lwarn "macro processor: prefix operator '$op': varname sub-expanded from argument '$arg1' is empty\n"; } if ($op =~ m/^\+/) { $$env{$varname}++; } else { $$env{$varname}--; } } else { $varname = parse_macro($arg1, $env); if (!defined($varname)) { lwarn "macro processor: varname argument '$arg1' sub-expansion yields 'undef'\n"; $varname = ""; } elsif ($varname eq "") { lwarn "macro processor: varname sub-expanded from argument '$arg1' is empty\n"; } } my $result = ""; if (defined($$env{$varname})) { $result = $$env{$varname}; } # postfix *crement operators if (defined(${_[0]}) && ${_[0]} =~ m/^([-+]{2})$/) { my $op = shift; if ($op =~ m/^\+/) { $$env{$varname}++; } else { $$env{$varname}--; } } # provisionary light-weight arrays based on CSV format if (defined(my $delim = shift) && defined(my $index = shift)) { $delim = parse_macro($delim, $env); $index = parse_macro($index, $env); my @list = split($delim, $result); # get last element when denoted $index = scalar(@list) - 1 if ($index eq "" || $index eq "*"); $result = $list[$index]; } if (!defined($result)) { lwarn "macro processor: expansion of variable '$varname' yields 'undef'\n"; } return $result; } if (/^let$/) { # assignment my $varname = parse_macro($arg1, $env); my $arg2 = shift; my $value = parse_macro($arg2, $env); $$env{$varname} = $value; return ""; } if (/^append$/) { # .= my $varname = parse_macro($arg1, $env); my $arg2 = shift; my $value = parse_macro($arg2, $env); $$env{$varname} .= $value; return ""; } if (/^set$/) { # provisionary light-weight arrays based on CSV format my $varname = parse_macro($arg1, $env); my $delim = shift; $delim = parse_macro($delim, $env); my $index = shift; $index = parse_macro($index, $env); my @list = split($delim, $$env{$varname}); # append to list when denoted $index = scalar(@list) if ($index eq "" || $index eq "*"); my $value = shift; $value = parse_macro($value, $env); $list[$index] = $value; $$env{$varname} = join($delim, @list); return ""; } if (/^dump[-_]?vars$/) { # write to stderr foreach my $key (sort alphanum_cmp keys(%$env)) { next if $key =~ m/^__.*__$/; my $val = $$env{$key}; lprint_stderr "$key='$val'\n"; } return ""; } if (/^([-+*\/%&|^]|>>|<<|min|max)$/) { # arithmetic / associative operators my $op = $1; my $number = make_numeric(parse_macro($arg1, $env)); while (defined(my $next = shift)) { my $operand = make_numeric(parse_macro($next, $env)); $_ = $op; if (/^\+$/) { $number += $operand; next; } if (/^-$/) { $number -= $operand; next; } if (/^\*$/) { $number *= $operand; next; } if (/^\/$/) { $number /= $operand; next; } if (/^%$/) { $number %= $operand; next; } if (/^&$/) { $number &= $operand; next; } if (/^\|$/) { $number |= $operand; next; } if (/^\^$/) { $number ^= $operand; next; } if (/^<<$/) { $number <<= $operand; next; } if (/^>>$/) { $number >>= $operand; next; } if (/^min$/) { $number = $operand if $number < $operand; next; } if (/^max$/) { $number = $operand if $number > $operand; next; } ldie "bad arithmetic operator '$op'"; } return $number; } if (/^([<>]=?|[!=]=)$/) { # numeric comparisons my $op = $1; my $n1 = make_numeric(parse_macro($arg1, $env)); my $arg2 = shift; my $n2 = make_numeric(parse_macro($arg2, $env)); $_ = $op; if (/^<$/) { return $n1 < $n2; } if (/^>$/) { return $n1 > $n2; } if (/^<=$/) { return $n1 <= $n2; } if (/^>=$/) { return $n1 >= $n2; } if (/^==$/) { return $n1 == $n2; } if (/^!=$/) { return $n1 != $n2; } ldie "bad comparison operator '$op'"; } if (/^(lt|gt|le|ge|eq|ne|match|=~)$/) { # binary string operators my $op = $1; $op = "=~" if $op eq "match"; my $n1 = parse_macro($arg1, $env); my $arg2 = shift; my $n2 = parse_macro($arg2, $env); $_ = $op; if (/^lt$/) { return $n1 lt $n2; } if (/^gt$/) { return $n1 gt $n2; } if (/^le$/) { return $n1 le $n2; } if (/^ge$/) { return $n1 ge $n2; } if (/^eq$/) { return $n1 eq $n2; } if (/^ne$/) { return $n1 ne $n2; } if (/^=~$/) { my $opts = "m"; my $arg3 = shift; $opts = parse_macro($arg3, $env) if defined($arg3); # unfortunately standard regex operators don't seem to accept variable options, so we use eval() my $result = eval("\$n1 =~ m{$n2}$opts"); return "" unless defined($result); return $result; } ldie "bad binary operator '$op'"; } if (/^(&&|\|\||and|or)$/) { # shortcut operators my $op = $1; $op = "&&" if $op eq "and"; $op = "||" if $op eq "or"; my $number = parse_macro($arg1, $env); while (my $next_arg = shift) { last unless defined($next_arg); if ($op eq "&&") { return 0 if !$number; } if ($op eq "||") { return 1 if $number; } my $operand = parse_macro($next_arg, $env); if ($op eq "&&") { $number &&= $operand; next; } if ($op eq "||") { $number ||= $operand; next; } ldie "bad shortcut operator '$op'"; } return $number; } if (/^([~!]|not)$/) { # unary operators my $op = $1; $op = "!" if $op eq "not"; my $number = parse_macro($arg1, $env); $_ = $op; if (/^~$/) { return ~$number; } if (/^!$/) { return !$number; } ldie "bad unary operator '$op'"; } # string functions if (/^toupper$/) { my $txt = parse_macro($arg1, $env); return uc($txt); } if (/^tolower$/) { my $txt = parse_macro($arg1, $env); return lc($txt); } if (/^length$/) { # string length my $txt = parse_macro($arg1, $env); return length($txt); } if (/^subst$/) { # regex substitution operator my $txt = parse_macro($arg1, $env); my $arg2 = shift; my $regex = parse_macro($arg2, $env); my $arg3 = shift; my $subst = parse_macro($arg3, $env); my $opts = "m"; my $arg4 = shift; $opts = parse_macro($arg4, $env) if defined($arg4); # unfortunately standard regex operators don't seem to accept variable options, so we use eval() eval("\$txt =~ s{$regex}{$subst}$opts"); return $txt; } if (/s?printf$/) { # sprintf() my $fmt = $arg1; # exception: don't evaluate the format string! my @list = (); while (defined(my $next = shift)) { my $operand = parse_macro($next, $env); push @list, $operand; } return sprintf($fmt, @list); } if (/^human[-_]?numbers?$/) { # convert numbers to readable format my $name = $_; my $unit = parse_macro($arg1, $env); my $arg2 = shift; my $delim_numbers = parse_macro($arg2, $env); $delim_numbers = "/" if $delim_numbers eq ""; my $arg3 = shift; my $delim_unit = parse_macro($arg3, $env); $delim_unit = " " if $delim_unit eq ""; my $max = 0; my @list = (); while (defined(my $next = shift)) { my $number = make_numeric(parse_macro($next, $env)); push @list, $number; $max = $number if $number > $max; } lwarn "macro $name: no number arguments given\n" unless @list; my $conv_unit = ""; my @results = (); foreach my $number (@list) { my $conv_number = number2human($number, $unit, $max); $conv_number =~ s/ *([a-z].*)//i; $conv_unit = $1; push @results, $conv_number; } return join($delim_numbers, @results) . "$delim_unit$conv_unit"; } if (/^human[-_]?seconds$/) { # convert numbers to readable format # don't use make_numeric() here in order to allow the result '--:--:--' my $number = parse_macro($arg1, $env); return seconds2human($number); } if (/^progress$/) { # progress bar my $length = make_numeric(parse_macro($arg1, $env)); $length = 20 unless ($length && $length > 0); my $arg2 = shift; if (!defined($arg2)) { # use default values my @vector = split(':', eval_fn($env, "summary-vector", ":")); return progress_bar($length, @vector); } my $min = make_numeric(parse_macro($arg2, $env)); my $arg3 = shift; my $mid = make_numeric(parse_macro($arg3, $env)); my $arg4 = shift; my $max = make_numeric(parse_macro($arg4, $env)); return progress_bar($length, $min, $mid, $max); } if (/^errno[-_]?text$/) { my $code = parse_macro($arg1, $env); return "" unless defined($code) && $code != 0; $code = -$code if $code < 0; return __conv_errno("", $code); } if (/^get[-_]?log[-_]?status/) { return get_error_text($$env{"cmd"}, $$env{"res"}); } if (/^get[-_]?resource[-_]?(fat|err|wrn)([-_]?count)?/) { my $what = $1; my $do_count = $2; my %assoc = ("fat" => 4, "err" => 3, "wrn" => 2); my $glob = $$env{"resdir"} . "/$assoc{$what}.*.status"; return _get_text($glob, undef, 0, $do_count); } if (/^warn/) { my $txt = parse_macro($arg1, $env); lwarn $txt; return ""; } if (/^die$/) { my $txt = parse_macro($arg1, $env); ldie $txt; return ""; } if (/^is-module-loaded$/) { my $path = "/proc/sys/mars"; my $result = -d $path; return defined($result) && $result; } if (/^is-(member|guest)$/) { my $type = $1; $arg1 = parse_macro($arg1, $env); $arg1 = $$env{"res"} unless $arg1; my $arg2 = shift; $arg2 = parse_macro($arg2, $env); $arg2 = $$env{"host"} unless $arg2; my $result; if ($type eq "guest") { $result = is_guest($arg1, $arg2); } else { $result = is_member($arg1, $arg2); } return $result ? 1 : 0; } # list objects if (/^(count[-_]?)?(cluster|resource|guest)[-_]?members$/) { my $old = $_; $_ =~ s/members/peers/; lwarn "deprecated: please use macro '$_' instead of '$old'\n"; } if (/^(count[-_]?)?(cluster|resource|guest)[-_]?peers$/) { my $do_count = $1; my $type = $2; my @peers; if ($type eq "cluster") { @peers = get_total_peers(); } elsif ($type eq "guest") { @peers = get_guest_peers($$env{"res"}); } else { @peers = get_member_peers($$env{"res"}); } if (defined($do_count)) { my $result = scalar(@peers); if ($ignore_deleted_peers && %is_deleted_peer) { $result = 0; foreach my $this_peer (@peers) { next if $is_deleted_peer{$this_peer}; $result++; } } return $result; } my $list = ""; foreach my $peer (@peers) { $list .= "$peer\n"; } return $list; } if (/^(count[-_]?)?(my|all)[-_]?(resources|members|guests)$/) { my $do_count = $1; my $what = $2; my $type = $3; my $peer = ""; if ($what eq "my") { $peer = parse_macro($arg1, $env); $peer = $$env{"host"} unless $peer; } my @list; if ($type eq "guests") { @list = get_guest_resources($peer); } elsif ($type eq "members") { @list = get_member_resources($peer); } else { @list = get_total_resources($peer); } return scalar(@list) if defined($do_count); my $list = ""; foreach my $item (@list) { $list .= "$item\n"; } return $list; } # low-level symlink access if (/^readlink$/) { my $path = parse_macro($arg1, $env); return get_link($path, 1); } if (/^setlink$/) { my $src = parse_macro($arg1, $env); my $arg2 = shift; my $dst = parse_macro($arg2, $env); set_link($src, $dst); return ""; } # high-level state access if (/^(get|todo|actual)[-_]?primary$/) { my $op = $1; my $primary; if ($op eq "actual") { lwarn "DEPRECATED: you are trying to uniquely identify an actual primary hostname, but this is conceptually wrong because in split-brain situations there may exist multiple ones. Use view-is-primary instead. That would be safe.\n" unless $$env{cmd} =~ m/-1and1$/; $primary = _get_actual_primary($$env{"res"}); } else { $primary = _get_designated_primary($$env{"res"}); } $primary = "" if (!defined($primary) || $primary eq "(none)"); $primary = ($primary eq $$env{"host"}) if $op eq "todo"; return $primary; } if (/^todo[-_]?secondary$/) { my $val = eval_fn($env, "get-primary", $arg1); return $val eq "(none)" ? 1 : 0; } if (/^todo[-_]?(attach|sync|fetch|replay)?$/) { my $what = $1; $what = parse_macro($arg1, $env) unless defined($what); my $lnk = $$env{"resdir"} . "/todo-" . $$env{"host"} . "/$what"; $lnk = correct_path($lnk); return get_link($lnk, 1); } if (/^get[-_]?msg$/) { my $what = parse_macro($arg1, $env); my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/msg-$what"; return get_link($lnk, 1); } if (/^(all|the)[-_](pretty[-_]?)?(global[-_]?)?((?:err|wrn|inf)[-_])?(msg|count)$/) { my $shorten = ($1 eq "the"); my $pretty = $2; my $global = $3; my $specific = $4; my $type = $5; $specific = "" unless defined($specific); my $glob = "$mars"; $glob = $$env{"resdir"} if (!defined($global) && $$env{"res"}); $glob .= "/actual-" . $$env{"host"} . "/msg-$specific*"; my $result = ""; my $count = 0; foreach my $msg_path (lamport_glob($glob)) { my $val = get_link($msg_path, 1); if ($shorten) { # skip uninteresting messages next if $val =~ m/^OK/; # skip _transient_ error messages if ($msg_path =~ m/-err-/ && $val =~ m/^([0-9]+\.[0-9]{9})/) { my $stamp = $1; if ($stamp) { my $delta = $timeout > 0 ? $timeout : 30; next if $stamp + $delta > time(); } } } $val = _replace_timestamps($val, $shorten) if defined($pretty); my $key = $msg_path; $key =~ s:^.*/msg-::; $result .= "$key: $val\n"; $count++; } return $count if $type eq "count"; return $result; } if (/^potential[-_]?features$/) { my $all_flags = ~0x0; my $result = featuresflags2txt($all_flags, 1); return $result; } if (/^(implemented|usable)[-_]?(features|compressions|digests)$/) { my $op = $1; my $restrict = $2; $op = "features" if $op eq "implemented"; my $str = get_alive_link($op, $$env{"host"}, 2); my $flags; if ($str =~ m/,(0x[0-9a-f]*)/) { $flags = $1; } $flags = 0x0 unless (defined($flags) && $flags ne ""); $flags = hex($flags) if $flags =~ m/^0x/; if ($restrict =~ m/compressions/) { $flags &= $compress_features; } elsif ($restrict =~ m/digests/) { $flags &= $chksum_features; } my $result = featuresflags2txt($flags, 1); return $result; } if (/^(enabled-(log|net)-compressions|disabled-(log|net)-digests)$/) { my $op = $1; my $path = "$mars/defaults/$op"; my $flags = get_link($path, 2); $flags = 0x0 unless (defined($flags) && $flags ne ""); $flags = hex($flags) if $flags =~ m/^0x/; #NYI #$path = "$mars/resource-" . $$env{"res"} . "/defaults/$op"; #my $res_flags = get_link($path, 2); #if (defined($res_flags) && $res_flags ne "") { #$res_flags = hex($res_flags) if $res_flags =~ m/^0x/; #if ($op =~ m/enabled/) { #$flags |= $res_flags; #} else { #$flags = $res_flags; #} #} if ($op =~ m/compressions/) { $flags &= $compress_features; } elsif ($op =~ m/digests/) { $flags &= $chksum_features; } my $unusable = $flags & ~$kernel_flags_version; if ($unusable > 0) { my $txt = featuresflags2txt($unusable); lwarn "features '$txt' are not compiled into the kernel module\n"; } my $result = featuresflags2txt($flags); return $result; } if (/^used-((log|net)-(compression|digest))$/) { my $op = $1; my $flags = get_alive_link("used-$op", $$env{"host"}, 2); $flags = 0x0 unless (defined($flags) && $flags ne ""); $flags = hex($flags) if $flags =~ m/^0x/; if ($op =~ m/compression/) { $flags &= $compress_features; } elsif ($op =~ m/digest/) { $flags &= $chksum_features; } my $result = featuresflags2txt($flags); return $result; } if (/^(tree|features|)[-_]?version$/) { my $op = $1; my $result = get_alive_link($op, $$env{"host"}, 2); if ($result =~ m/^([^,]*,[^,]*,)(0x[0-9a-z]+),(0x[0-9a-z]+)/p) { my $res1 = $1; my $res2 = $2; my $res3 = $3; my $rest = $POSTMATCH; $result = $res1 . featuresflags2txt($res2) . "," . featuresflags2txt($res3) . $rest; } return $result; } if (/^is[-_]?alive$/) { my $peer = parse_macro($arg1, $env); $peer = _get_designated_primary($$env{"res"}) unless $peer; $peer = $$env{"host"} unless $peer; my $stamp = get_alive_link("time", $peer, 1); return is_recent($stamp, $$env{"window"}); } if (/^alive[-_]?timestamp$/) { my $peer = parse_macro($arg1, $env); $peer = _get_designated_primary($$env{"res"}) unless $peer; return -1 if !$peer || $peer eq "(none)"; my $result = get_alive_link("time", $peer, 2); return $result if $result; return get_alive_stamp("alive", $peer); } if (/^is[-_]?orphan$/) { my $peer = parse_macro($arg1, $env); $peer = $$env{"host"} unless $peer; return 0 if eval_fn($env, "is-primary", $peer); my $replay = get_link($$env{"resdir"} . "/replay-$peer", 1); my $matches = $replay =~ m/^(log-[^,]+),([0-9]*)/; return 1 unless $matches; my $logfile = $$env{"resdir"} . "/" . $1; my $logpos = $2; if (! -r $logfile) { return 1; } my @stat = stat($logfile); if (!@stat) { return 1; } my $size= $stat[7]; if ($size < $logpos) { return 1; } return 0; } if (/^is[-_]?(almost[-_]?)?consistent$/) { my $almost = $1; # has sync finished? my $syncrest = make_numeric(eval_fn($env, "sync-rest", "")); return 0 if $syncrest > 0; if (!$almost && eval_fn($env, "is-primary", "")) { # is the replay link indicating that something is not yet applied / dirty? my $replay = get_link($$env{"resdir"} . "/replay-" . $$env{"host"}, 1); $replay =~ m:,[0-9]+,([0-9]+):; my $rest = $1; return 0 if $rest > 0; } # are all logfiles applied which had accumulated during sync? my $replay = get_link($$env{"resdir"} . "/replay-" . $$env{"host"}, 1); return 0 unless $replay; my $syncpos = get_link($$env{"resdir"} . "/syncpos-" . $$env{"host"}, 2); if (defined($syncpos) && $syncpos) { my $cmp = compare_replaylinks($syncpos, $replay); return 0 if $cmp > 0; } return 1; } if (/^get[-_]?disk$/) { my $lnk = $$env{"resdir"} . "/data-" . $$env{"host"}; my $result = get_link($lnk, 1); $result = "" unless defined($result); return $result; } if (/^get[-_]?device$/) { my $result = device_name($$env{"res"}, $$env{"host"}); return $result; } if (/^disk[-_]?error$/) { my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/disk-error"; my $result = get_link($lnk, 2); return $result; } if (/^disk[-_]?present$/) { my $lnk = $$env{"resdir"} . "/data-" . $$env{"host"}; my $result = get_link($lnk, 1); $result = "" unless defined($result); if ($result) { # TODO: make this remotely retrievable via status link $result = -b $result; $result = "0" unless defined($result); } return $result; } if (/^device[-_]?present$/) { my $result = device_exists($$env{"res"}, $$env{"host"}); return $result; } # deprecated (irregular names) if (/^present[-_]?(disk|device)$/) { my $what = $1; return eval_fn($env, "$what-present", $arg1); } if (/^(device)[-_]?(opened|nrflying|error|completion-stamp|completion-age)$/) { my $what = $1; my $op = $2; my $peer = $$env{"host"}; my %transl = ( "opened" => "open-count", "nrflying" => "if-flying", "error" => "if-state", "completion-stamp" => "if-completion-stamp", "completion-age" => "if-completion-stamp", ); if ($what eq "device") { my $other = eval_fn($env, "get-device", $arg1); if ($other =~ m/\@(.*)/) { $peer = $1; } } my $lnk = $$env{"resdir"} . "/actual-$peer/" . $transl{$op}; my $result = get_link($lnk, 2); return 0 unless $result; if ($op =~ m/-age/) { $result = mars_time() - $result; $result = 0 if $result < 0; } return $result; } if (/^is[-_]?split([-_]?brain)?$/) { my $split = detect_splitbrain($$env{"res"}, 0); return $split ? 0 : 1; } if (/^is[-_]?(attach|sync|fetch|replay|primary|secondary|emergency)$/) { my $what = $1; my $is = "is"; $is = "has" if $what eq "emergency"; my $peer = parse_macro($arg1, $env); $peer = $$env{"host"} unless $peer; my $lnk = $$env{"resdir"} . "/actual-$peer/$is-$what"; $lnk = correct_path($lnk); my $val = get_link($lnk, 1); $val = $val ? 0 : 1 if $what eq "secondary"; return $val; } if (/^nr[-_]?(attach|sync|fetch|replay|primary|secondary)$/) { my $what = $1; my $is = "is"; $is = "has" if $what eq "emergency"; my $nr = 0; foreach my $peer (get_member_peers($$env{"res"})) { my $lnk = $$env{"resdir"} . "/actual-$peer/$is-$what"; $lnk = correct_path($lnk); my $val = get_link($lnk, 1); $val = !$val if $what eq "secondary"; $nr++ if $val; } return $nr; } if (/^does$/) { my $what = parse_macro($arg1, $env); my $is = "is"; $is = "has" if $what eq "emergency"; my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/$is-$what"; $lnk = correct_path($lnk); return get_link($lnk, 1); } if (/^(tree|rest-space)$/) { my $what = $1; return get_alive_link($what, $$env{"host"}, 1) } if (/^systemd[-_]?unit$/) { return _get_systemd_unit($$env{"cmd"}, $$env{"res"}); } if (/^(uuid)$/) { my $what = $1; my $lnk = "$mars/$what"; $lnk = correct_path($lnk); return get_link($lnk, 1); } if (/^replay[-_]?code$/) { my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/replay-code"; return get_link($lnk, 2); } if (/^(device|sync|fetch|replay|work)[-_]?(rate|ops[-_]?rate|amount[-_]?rate|remain)$/) { my $what = $1; my $select = $2; if ($what eq "work") { my $val1 = eval_fn($env, "fetch-$select", ""); my $val2 = eval_fn($env, "replay-$select", ""); return "" if (!defined($val1) || $val1 eq ""); return "" if (!defined($val2) || $val2 eq ""); $val1 = make_numeric($val1); $val2 = make_numeric($val2); return $val1 + $val2 if $select eq "remain"; # take the maximum rate return $val1 if $val1 > $val2; return $val2; } my $name = $select; $name = "amount_rate" if ($name eq "rate" || $name eq "remain"); $name =~ s/-/_/; my %names = ( "device" => "if", "sync" => "sync", "fetch" => "file", "replay" => "replay", ); $name =~ s/_/-$names{$what}_/; my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/$name"; my $rate = get_link($lnk, 2); # deprecated: compatibility with old version if (!defined($rate) || $rate eq "") { $lnk =~ s:/amount-:/:; $rate = get_link($lnk, 2); } if ($select eq "remain") { my $rest = make_numeric(eval_fn($env, "$what-rest", "")); return 0 if $rest <= 0; return -1 if (!defined($rate) || $rate eq "" || $rate <= 0); return $rest / 1024 / $rate; } if ($select =~ /rate/) { return 0 if (!defined($rate) || !$rate || $rate <= 0); return $rate if $select =~ /^ops-/; return $rate * 1024; } ldie "unknown macro $_\n"; } if (/^sync[-_]?size$/) { my $lnk = $$env{"resdir"} . "/size"; return get_link($lnk, 1); } if (/^sync[-_]?pos$/) { my $lnk = $$env{"resdir"} . "/syncstatus-" . $$env{"host"}; return get_link($lnk, 1); } if (/^(replay)[-_]?(lognr|basenr)$/) { my $op = $2; my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 1); return $firstpos if $op eq "lognr"; return $logpos; } if (/^(fetch)[-_]?(lognr)$/) { my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 1); return $logpos; } if (/^(work)[-_]?(lognr)$/) { my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0); return $logpos; } if (/^replay[-_]?logcount$/) { return eval_fn($env, "replay-lognr", "") - eval_fn($env, "replay-basenr", ""); } if (/^(fetch|work)[-_]?logcount$/) { my $what = $1; return eval_fn($env, "$what-lognr", "") - eval_fn($env, "replay-lognr", ""); } if (/^writeback[-_]?rest$/) { my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0); return $sum; } if (/^(replay|work)[-_]?(pos)$/) { my $what = $1; my $op = $2; my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0); return $sum; } if (/^(replay[-_]?size)|(fetch[-_]?pos)$/) { my ($sum0) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0); my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 1); return $sum0 + $sum; } if (/^(fetch|work)[-_]?(size)$/) { my $what = $1; my ($sum0) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0); my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0); return $sum0 + $sum; } if (/^(sync|fetch|replay|work)[-_]?(rest|(?:almost[-_]?|threshold[-_]?)?reached|percent|permille|vector)$/) { my $what = $1; my $op = $2; my $size = make_numeric(eval_fn($env, "$what-size", "")); my $pos = make_numeric(eval_fn($env, "$what-pos", "")); my $result = 0; if ($op eq "rest") { $result = $size - $pos if $pos < $size; } elsif ($op =~ m/^almost/) { my $limit = make_numeric(parse_macro($arg1, $env)) if $arg1 ne ""; $limit = 990 if $limit <= 0; $result = 1 if int($pos / $limit) >= int($size / 1000); } elsif ($op =~ m/^threshold/) { my $my_threshold = make_numeric($$env{"threshold"}); $arg1 = parse_macro($arg1, $env); $my_threshold = make_numeric(get_size($arg1)) if $arg1 ne ""; $result = 1 if $pos + $my_threshold >= $size; } elsif ($op eq "reached") { $result = 1 if $pos >= $size; } elsif ($op eq "percent") { $result = 100; $result = 100.5 * $pos / $size if $size > 0; $result = 100 if $result >= 100.0; } elsif ($op eq "permille") { $result = 1000; $result = 1000.5 * $pos / $size if $size > 0; $result = 1000 if $result >= 1000.0; } elsif ($op eq "vector") { my $delim = parse_macro($arg1, $env); $delim = ":" unless $delim; $result = "$pos$delim$size"; } else { ldie "unknown operation '$op'\n"; } return $result; } if (/^summary[-_]?vector$/) { my $pos1 = make_numeric(eval_fn($env, "replay-pos", "")); my $pos2 = make_numeric(eval_fn($env, "fetch-pos", "")); my $size = make_numeric(eval_fn($env, "fetch-size", "")); my $delim = parse_macro($arg1, $env); $delim = ":" unless $delim; return "$pos1$delim$pos2$delim$size"; } if (/^deletable[-_]?size$/) { my ($min, $max) = _get_deletable_logfiles($_, $$env{"res"}); my $sum = 0; foreach my $path (lamport_glob("$mars/resource-" . $$env{"res"} . "/log-*")) { $path =~ m/\/log-([0-9]+)-/; my $nr = $1; next if $nr < $min or $nr >= $max; my @stat = stat($path); $sum += $stat[7]; } return $sum; } if (/^occupied[-_]?size$/) { my $sum = 0; foreach my $path (lamport_glob("$mars/resource-" . $$env{"res"} . "/log-*")) { my @stat = stat($path); $sum += $stat[7]; } return $sum; } if (/^(disk|resource|device)[-_]?size$/) { my $what = $1; my $path = "$mars/resource-" . $$env{"res"}; if ($what eq "device" && eval_fn($env, "device-present", "")) { return _get_mars_size($$env{"cmd"}, $$env{"res"}); } elsif ($what eq "disk"){ my $peer = parse_macro($arg1, $env); $peer = $$env{"host"} unless $peer; $path .= "/actsize-$peer"; } else { $path .= "/size"; } return get_link($path, 1); } if (/^resource[-_]?possible[-_]?size$/) { return get_possible_size($$env{"cmd"}, $$env{"res"}); } # time handling and pausing if (/^time$/) { return mars_time(); } if (/^real[-_]?time$/) { my $time = time(); return $time; } if (/^replay[-_]?timestamp$/) { $arg1 = parse_macro($arg1, $env); $arg1 = $$env{"host"} unless $arg1; my $replay = $$env{"resdir"} . "/replay-$arg1"; return get_link_stamp($replay); } if (/^fetch[-_]?timestamp$/) { $arg1 = parse_macro($arg1, $env); $arg1 = $$env{"host"} unless $arg1; my $fetch_glob = $$env{"resdir"} . "/version-*-$arg1"; my @paths = sort alphanum_cmp lamport_glob($fetch_glob); return -1 if !@paths; my $fetch = pop(@paths); return get_link_stamp($fetch); } if (/^work[-_]?timestamp$/) { my $time1 = make_numeric(eval_fn($env, "fetch-timestamp", $arg1)); my $time2 = make_numeric(eval_fn($env, "replay-timestamp", $arg1)); # use the "best" of both... return $time1 if $time1 > $time2; return $time2; } if (/^(fetch|replay|work|alive)[-_]?age$/) { my $what = $1; my $time = make_numeric(eval_fn($env, "$what-timestamp", $arg1)); return -1 if $time <= 0; return int(mars_time()) - $time; } if (/^(fetch|replay|work|alive)[-_]?lag$/) { my $what = $1; $arg1 = $$env{"host"} unless $arg1; my $arg2 = shift; my $primary = parse_macro($arg2, $env); if (!$primary) { $primary = _get_designated_primary($$env{"res"}); } return -1 if !$primary || $primary eq "(none)"; my $time1 = make_numeric(eval_fn($env, "$what-timestamp", $arg1)); my $time2 = make_numeric(eval_fn($env, "$what-timestamp", $primary)); return -1 if ($time1 <= 0 || $time2 <= 0); return 0 if $time1 < $time2; return $time1 - $time2; } if (/^sleep$/) { my $time = parse_macro($arg1, $env); sleep($time); return ""; } if (/^timeout$/) { my $time = parse_macro($arg1, $env); sleep_timeout($time); return ""; } if (/^wait[-_]?((?:todo|is)[-_](?:attach|sync|fetch|replay|primary|secondary)[-_](?:on|off))$/) { my $specific = $1; $specific =~ s/_/-/g; wait_cond($$env{"cmd"}, $$env{"res"}, $specific); return ""; } if (/^wait(?:[-_]?resource)?$/) { my $specific = parse_macro($arg1, $env); wait_cond($$env{"cmd"}, $$env{"res"}, $specific); return ""; } if (/^wait[-_]?cluster$/) { my $specific = parse_macro($arg1, $env); wait_cluster($$env{"cmd"}, $$env{"res"}, $specific); return ""; } # generic flow control and loops if (/^(get|foreach)[-_]?glob$/) { my $op = $1; my $paths = parse_macro($arg1, $env); my $arg2 = shift; my $varname = parse_macro($arg2, $env); my $arg3 = shift; my @list = lamport_glob($paths); my $result = ""; if ($op eq "get") { my $delim = parse_macro($arg3, $env); foreach my $path (@list) { $result .= $delim if $result; $result .= $path; } } else { # foreach foreach my $path (@list) { $$env{$varname} = $path; $result .= parse_macro($arg3, $env); } } return $result; } if (/^(if|unless)$/) { my $op = $1; my $cond = parse_macro($arg1, $env); $cond = !$cond if $op eq "unless"; my $arg2 = shift; if ($cond) { ldie "undefined $op-part\n" unless defined($arg2); return parse_macro($arg2, $env); } elsif (defined(my $arg3 = shift)) { return parse_macro($arg3, $env); } return ""; } if (/^else?(if|unless)$/) { my $op = $1; unshift @_, $arg1; while (defined(my $arg1 = shift)) { if (defined(my $arg2 = shift)) { my $cond = parse_macro($arg1, $env); $cond = !$cond if $op eq "unless"; if ($cond) { return parse_macro($arg2, $env); } } else { # odd number of arguments is treated as final "else" return parse_macro($arg1, $env); } } return ""; } if (/^while$/) { my $arg2 = shift; my $result = ""; while (parse_macro($arg1, $env)) { $result .= parse_macro($arg2, $env); next if _control_macro($env, "__next__"); last if _control_macro($env, "__last__"); } return $result; } if (/^until$/) { my $arg2 = shift; my $result = ""; until (parse_macro($arg1, $env)) { $result .= parse_macro($arg2, $env); next if _control_macro($env, "__next__"); last if _control_macro($env, "__last__"); } return $result; } if (/^for$/) { my ($arg2, $arg3, $arg4) = (shift, shift, shift); my $result = ""; for (parse_macro($arg1, $env); parse_macro($arg2, $env); parse_macro($arg3, $env)) { $result .= parse_macro($arg4, $env); next if _control_macro($env, "__next__"); last if _control_macro($env, "__last__"); } return $result; } if (/^foreach$/) { my $varname = parse_macro($arg1, $env); my $arg2 = shift; my $txt = parse_macro($arg2, $env); my $arg3 = shift; my $delim = parse_macro($arg3, $env); my $arg4 = shift; my $result = ""; foreach my $value (split($delim, $txt)) { $$env{$varname} = $value; $result .= parse_macro($arg4, $env); next if _control_macro($env, "__next__"); last if _control_macro($env, "__last__"); } return $result; } if (/^protect$/) { # don't evaluate argument, take verbatim return $arg1; } if (/^eval$/) { # evaluate given number of times my $count = parse_macro($arg1, $env); my $arg2 = shift; while ($count-- > 0) { $arg2 = parse_macro($arg2, $env); } return $arg2; } if (/^eval[-_]?down$/) { # evaluate until result is stable for (;;) { my $old = $arg1; $arg1 = parse_macro($arg1, $env); last if $arg1 eq $old; } return $arg1; } if (/^tmp$/) { # evaluate once in a temporary scope my %copy_env = %$env; my $result = parse_macro($arg1, \%copy_env); return $result; } if (/^(include|call)$/) { my $op = $1; my $name = parse_macro($arg1, $env); my $txt = get_macro($name); if ($op eq "call") { # run in new sub-scope my %copy_env = %$env; set_args($env, \%copy_env, $name, @_); return parse_macro($txt, \%copy_env); } # 'include' runs in the same scope set_args($env, $env, $name, @_); return parse_macro($txt, $env); } if (/^callstack$/) { return $$env{"callstack"}; } if (/^(abort|return|stop-eval)$/) { my $op = $1; $$env{$op} = 1; return ""; } if (/^(next|last)$/) { my $op = $1; $$env{$op} = 1; $$env{"__return__"} = 1; return ""; } ldie "call to unknown macro '$fn'\n"; } } ################################################################## # macro parsing sub _control_macro { my $env = shift; my $control = shift; my $result = $$env{$control}; $$env{$control} = 0; return $result; } sub parse_macro { my ($text, $env) = @_; $text = "" unless defined($text); my $old_callstack = $$env{"callstack"}; my $result = ""; while ($text =~ m/\\(.)|$match_fn/sp) { my $escape = $1; my $fn = $2; my @args = ($3); my $pre = $PREMATCH; my $post = $POSTMATCH; if (defined($escape)) { $result .= $pre; $text = $post; $_ = $escape; if (/[tnrfbae]/) { eval "\$result .= \"\\$escape\""; next; } if (/[a-zA-Z]/) { lwarn "control sequence '\\$escape' is reserved for future use\n"; } $result .= $escape; next; } return "" if _control_macro($env, "__abort__"); return $result if _control_macro($env, "__return__"); return $result . $text if _control_macro($env, "__stop_eval__"); $result .= $pre; $text = $post; while ($text =~ m/\A\{($match_inner)\}/sp ) { push @args, $1; $text = $POSTMATCH; } my $new = eval_fn($env, $fn, (@args)); ldie "undefined result from evaluation of primitive macro '$fn'\n" unless defined($new); $$env{"callstack"} = $old_callstack; $result .= $new; } return "" if _control_macro($env, "__abort__"); return $result if _control_macro($env, "__return__"); return $result . $text if _control_macro($env, "__stop_eval__"); return $result . $text; } sub make_env { my ($cmd, $res, $text) = (shift, shift, shift); $text =~ s{$match_comment}{}sg; my $ip = _get_ip($host); my %start_env = ( "cmd" => $cmd, "res" => $res, "resdir" => "$mars/resource-$res", "mars" => $mars, "host" => $host, "real_host" => $real_host, "ip" => $ip, "timeout" => $timeout, "threshold" => $threshold, "window" => $window, "force" => $force, "dry-run" => $dry_run, "verbose" => $verbose, "callstack" => "", # internal, deliberately not documented "__abort__" => 0, "__return__" => 0, "__stop_eval__" => 0, "__next__" => 0, "__last__" => 0, ); set_args(\%start_env, \%start_env, $cmd, @_); return ($text, \%start_env); } sub eval_macro { my ($text, $start_env) = make_env(@_); return parse_macro($text, $start_env); } ################################################################## # macro commands my $macro = ""; my %complex_macros = ( "default" => "" . "%elsif{%is-member{}}{" . "%call{device-info}" . " %{res} [%count-resource-peers{%{res}}/%count-cluster-peers{}]" . " %include{diskstate} %include{replstate} %include{flags} %include{role} %include{primarynode} %include{commstate}\n" . "%if{%>{%-{%disk-size{}}{%resource-size{}}}{%{threshold}}}{" . " Hint: you are wasting %human-numbers{}{ }{ }{%-{%disk-size{}}{%resource-size{}}} on disk %get-disk{}\n" . "}" . "%if{%>{%resource-possible-size{}}{%resource-size{}}}{" . " Hint: marsadm resize would increase resource %{res} by %human-numbers{}{ }{ }{%-{%resource-possible-size{}}{%resource-size{}}}\n" . "}" . "%if{%and{%not{%is-primary{}}}{%is-attach{}}}{" . "%if{%not{%sync-reached{}}}{" . "%include{syncinfo}" . "}" . "%if{%not{%work-reached{}}}{" . "%include{replinfo}" . "}" . "}" . "%call{resource-errors}" . "}{" . "%call{device-info}" . "}", "default-resource" => "%if{%{res}}{" . "%{res} %human-numbers{}{ }{ }{%resource-size{}} " . "[%count-resource-peers{%{res}}/%count-cluster-peers{}]" . "}", "default-global" => "%call{comminfo}", "default-header" => "%if{%usable-compressions{}}{" . "used-log-compression=\"%used-log-compression{}\"\n" . "%if{%{verbose}}{" . " usable: \"%usable-compressions{}\"\n" . " enabled: \"%enabled-log-compressions{}\"\n" . "}" . "used-net-compression=\"%used-net-compression{}\"\n" . "%if{%{verbose}}{" . " usable: \"%usable-compressions{}\"\n" . " enabled: \"%enabled-net-compressions{}\"\n" . "}" . "}" . "%if{%usable-digests{}}{" . "used-log-digest=\"%used-log-digest{}\"\n" . "%if{%{verbose}}{" . " usable: \"%usable-digests{}\"\n" . " disabled: \"%disabled-log-digests{}\"\n" . "}" . "used-net-digest=\"%used-log-digest{}\"\n" . "%if{%{verbose}}{" . " usable: \"%usable-digests{}\"\n" . " disabled: \"%disabled-net-digests{}\"\n" . "}" . "}", "default-footer" => "", "1and1" => "%if{%{res}}{" . " %{res} %include{diskstate-1and1} %include{replstate-1and1} %include{flags-1and1} %include{role-1and1} %include{primarynode-1and1}\n" . "%if{%and{%is-attach{}}{%not{%sync-reached{}}}}{" . "%include{syncinfo-1and1}" . "}" . "%if{%and{%is-attach{}}{%not{%work-threshold-reached{}}}}{" . "%include{replinfo-1and1}" . "}" . "%call{resource-errors-1and1}" . "}{" . "%the-pretty-global-msg{}" . "}", "device-stats" => "" . "%if{%device-opened{}}{" . "Opened, " . "%if{%device-error{}}{" . "ERROR %device-error{} %errno-text{%device-error{}}, " . "}" . "%device-ops-rate{} IOPS" . "%if{%device-nrflying{}}{" . ", %device-nrflying{} Flying" . "%if{%>{%device-completion-age{}}{%{window}}}{" . ", HANGING age: %human-seconds{%device-completion-age{}}" . "}" . "}" . "}{" . "Closed" . "}", "device-info" => "%if{%device-present{}}{" . " LocalDevice %get-device{}" . "" . " [%call{device-stats}]\n" . "}", "diskstate" => "%if{%disk-error{}}{" . "DISK_ERROR %disk-error{} %errno-text{%disk-error{}} " . "}" . "%elsif{%not{%get-disk{}}}{" . "NotJoined" . "}{%not{%disk-present{}}}{" . "NotPresent" . "}{%not{%todo-attach{}}}{" . "%if{%is-attach{}}{" . "Detaching" . "}{" . "Detached" . "}" . "}{%and{%replay-code{}}{%<{%replay-code{}}{0}}}{" . "DefectiveLog[%errno-text{%replay-code{}}]" . "}{%is-orphan{}}{" . "Orphan" . "}{%not{%is-attach{}}}{" . "NoAttach" . "}{%not{%is-consistent{}}}{" . "%elsif{%not{%todo-primary{}}}{" . "InConsistent" . "}{%is-primary{}}{" . "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]" . "}{" . "Recovery" . "}" . "}{%not{%work-reached{}}}{" . "%elsif{%not{%todo-primary{}}}{" . "OutDated[%call{outdated-flags}]" . "}{%is-primary{}}{" . "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]" . "}{" . "Recovery" . "}" . "}{%and{%ne{%get-primary{}}{}}{%tmp{%let{host}{%get-primary{}}%is-emergency{}}}}{" . "EmergencyMode" . "}{" . "UpToDate" . "}", "diskstate-1and1" => "%if{%disk-present{}}{" . "%if{%does{attach}}{" . "%if{%is-almost-consistent{}}{" . "%if{%work-reached{}}{" . "Uptodate" . "}{" . "Outdated[%call{outdated-flags-1and1}]" . "}" . "}{Inconsistent}" . "}{Detached}" . "}{Detached}", "outdated-flags" => "%if{%fetch-reached{}}{}{F}%if{%replay-reached{}}{}{R}", "outdated-flags-1and1" => "%if{%fetch-reached{}}{}{F}%if{%replay-reached{}}{}{R}", "replstate" => "%elsif{%not{%is-module-loaded{}}}{" . "ModuleNotLoaded" . "}{%not{%is-alive{%{host}}}}{" . "UnResponsive" . "}{%not{%get-disk{}}}{" . "NotJoined" . "}{%not{%todo-attach{}}}{" . "NotStarted" . "}{%todo-primary{}}{" # I am designated primary . "%elsif{%is-emergency{}}{" . "EmergencyMode" . "}{%is-primary{}}{" . "Replicating" . "}{" . "NotYetPrimary" . "}" . "}{" # Secondary . "%elsif{%and{%sync-rest{}}{%not{%todo{sync}}}}{" . "PausedSync" . "}{%is-sync{}}{" . "Syncing" . "}{%not{%todo{fetch}}}{" . "PausedFetch" . "}{%not{%todo{replay}}}{" . "PausedReplay" . "}{%not{%get-primary{}}}{" . "NoPrimaryDesignated" . "}{%not{%is-alive{}}}{" . "PrimaryUnreachable" . "}{%is-orphan{}}{" . "Orphan" . "}{" . "Replaying" . "}" . "}", "replstate-1and1" => "%if{%disk-present{}}{" . "%if{%is-primary{}}{" . "Replicating" . "}{" . "%if{%is-alive{}}{" . "%if{%and{%not{%sync-reached{}}}{%not{%todo{sync}}}}{" . "PausedSync" . "}{" . "%if{%does{sync}}{" . "Syncing" . "}{" . "%unless{%and{%todo-fetch{}}{%todo-replay{}}}{" . "PausedReplay" . "}{Replaying}" . "}" . "}" . "}{PrimaryUnreachable}" . "}" . "}{NotJoined}", "flags" => "%if{%disk-present{}}{%if{%device-present{}}{D}{d}}{-}" . "%if{%is-consistent{}}{C}{%if{%disk-present{}}{c}{-}}" . "%if{%does{attach}}{%if{%todo{attach}}{A}{a}}{%if{%todo{attach}}{a}{-}}" . "%if{%sync-reached{}}{S}{%if{%todo{sync}}{s}{-}}" . "%if{%fetch-reached{}}{F}{%if{%todo{fetch}}{f}{-}}" . "%if{%replay-reached{}}{R}{%if{%todo{replay}}{r}{-}}", "flags-1and1" => "-%if{%todo{sync}}{S}{-}%if{%todo{fetch}}{F}{-}%if{%todo{replay}}{R}{-}-", "todo-role" => "%if{%disk-present{}}{" . "%if{%todo-primary{}}{" . "Primary" . "}{" . "Secondary" . "}" . "}{None}", "role" => "%if{%disk-present{}}{" . "%if{%todo-primary{}}{" . "%if{%is-primary{}}{" . "%if{%>{%nr-primary{}}{1}}{" . "Forced" . "}" . "Primary" . "}{" . "NotYetPrimary" . "}" . "}{" . "%if{%is-primary{}}{" . "RemainsPrimary" . "}{" . "Secondary" . "}" . "}" . "}{None}", "role-1and1" => "%if{%disk-present{}}{" . "%if{%is-primary{}}{" . "Primary" . "}{" . "Secondary" . "}" . "}{Secondary}", "primarynode" => "%if{%todo-primary{}}{" . "%{host}" . "}{" . "%get-primary{}" . "}", "primarynode-1and1" => "%if{%disk-present{}}{" . "%if{%is-primary{}}{" . "%{host}" . "}{" . "%if{%actual-primary{}}{" . "%actual-primary{}" . "}{-}" . "}" . "}{-}", "commstate" => "%let{comm}{%alive-age{}}" . "%if{%>={%{comm}}{%{window}}}{" . "%human-seconds{%{comm}}" . "}", "syncinfo" => "%let{amount}{%human-numbers{}{ }{ }{%sync-pos{}}{%sync-size{}}}" . "%let{rate}{%human-numbers{}{ }{ }{%sync-rate{}}}" . "%sprintf{ syncing: %s %.2f%% (%d/%d)%s rate: %.2f %s/sec remaining: %s hrs\n}" . "{%progress{20}{%sync-pos{}}{0}{%sync-size{}}}" . "{%sync-percent{}}" . "{%{amount}{ }{0}}" . "{%{amount}{ }{1}}" . "{%{amount}{ }{2}}" . "{%{rate}{ }{0}}" . "{%{rate}{ }{1}}" . "{%human-seconds{%sync-remain{}}}" . "%call{sync-line}", "syncinfo-1and1" => "%let{amount}{%human-numbers{}{ }{ }{%sync-pos{}}{%sync-size{}}}" . "%let{rate}{%human-numbers{}{ }{ }{%sync-rate{}}}" . "%sprintf{ syncing: %s %.2f%% (%d/%d)%s rate: %.2f %s/sec remaining: %s hrs\n}" . "{%progress{20}{%sync-pos{}}{0}{%sync-size{}}}" . "{%sync-percent{}}" . "{%{amount}{ }{0}}" . "{%{amount}{ }{1}}" . "{%{amount}{ }{2}}" . "{%{rate}{ }{0}}" . "{%{rate}{ }{1}}" . "{%human-seconds{%sync-remain{}}}" . "%call{sync-line-1and1}", "replinfo" => "%let{amount}{%human-numbers{}{ }{ }{%replay-pos{}}{%fetch-size{}}}" . "%let{logs}{%replay-lognr{}}" . "%let{l2}{%fetch-lognr{}}" . "%let{l3}{%work-lognr{}}" . "%if{%>{%{l2}}{%{logs}}}{%append{logs}{::%{l2}}}" . "%if{%>{%{l3}}{%{l2}}}{%append{logs}{..%{l3}}}" . "%sprintf{ replaying: %s %.2f%% (%d/%d)%s logs: [%s]\n}" . "{%progress{20}{%replay-pos{}}{%fetch-pos{}}{%fetch-size{}}}" . "{%work-percent{}}" . "{%{amount}{ }{0}}" . "{%{amount}{ }{1}}" . "{%{amount}{ }{2}}" . "{%{logs}}" . "%call{fetch-line}" . "%call{replay-line}", "replinfo-1and1" => "%let{amount}{%human-numbers{}{ }{ }{%replay-pos{}}{%fetch-size{}}}" . "%sprintf{ replaying: %s %.2f%% (%d/%d)%s logs: [%d..%d]\n}" . "{%progress{20}{%replay-pos{}}{%fetch-pos{}}{%fetch-size{}}}" . "{%work-percent{}}" . "{%{amount}{ }{0}}" . "{%{amount}{ }{1}}" . "{%{amount}{ }{2}}" . "{%replay-lognr{}}" . "{%fetch-lognr{}}" . "%call{fetch-line-1and1}" . "%call{replay-line-1and1}", "sync-line" => "%let{amount}{%human-numbers{}{}{}{%sync-pos{}}{%sync-size{}}}" . "%let{rate}{%human-numbers{}{}{}{%sync-rate{}}}" . "%let{remain}{%human-seconds{%sync-remain{}}}" . " > sync: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n", "sync-line-1and1" => "%let{amount}{%human-numbers{}{}{}{%sync-pos{}}{%sync-size{}}}" . "%let{rate}{%human-numbers{}{}{}{%sync-rate{}}}" . "%let{remain}{%human-seconds{%sync-remain{}}}" . " > sync: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n", "fetch-line" => "%let{amount}{%human-numbers{}{}{}{%fetch-rest{}}}" . "%let{rate}{%human-numbers{}{}{}{%fetch-rate{}}}" . "%let{remain}{%human-seconds{%fetch-remain{}}}" . "%let{age}{%if{%and{%fetch-remain{}}{%>={%fetch-age{}}{%{window}}}}{ age: %human-seconds{%fetch-age{}}}}" . "%let{lag}{%if{%and{%fetch-remain{}}{%>={%fetch-lag{}}{%{window}}}}{ lag: %human-seconds{%fetch-lag{}}}}" . " > fetch: %{amount}%{age}%{lag} rate: %{rate}/s remaining: %{remain}\n", "fetch-line-1and1" => "%let{amount}{%human-numbers{}{}{}{%fetch-rest{}}}" . "%let{rate}{%human-numbers{}{}{}{%fetch-rate{}}}" . "%let{remain}{%human-seconds{%fetch-remain{}}}" . " > fetch: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n", "replay-line" => "%let{amount}{%human-numbers{}{}{}{%replay-rest{}}}" . "%let{rate}{%human-numbers{}{}{}{%replay-rate{}}}" . "%let{remain}{%human-seconds{%replay-remain{}}}" . "%let{age}{%if{%and{%replay-remain{}}{%>={%replay-age{}}{%{window}}}}{ age: %human-seconds{%replay-age{}}}}" . " > replay: %{amount}%{age} rate: %{rate}/s remaining: %{remain}\n", "replay-line-1and1" => "%let{amount}{%human-numbers{}{}{}{%replay-rest{}}}" . "%let{rate}{%human-numbers{}{}{}{%replay-rate{}}}" . "%let{remain}{%human-seconds{%replay-remain{}}}" . " > replay: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n", "resource-errors" => "%let{fat-count}{%get-resource-fat-count{}}" . "%if{%{fat-count}}{" . "FATALS FILE (%{fat-count})" . "%if{%{verbose}}{" . ":\n%get-resource-fat{}" . "}{" . ": available with --verbose\n" . "}" . "}" . "%let{errs}{%the-err-count{}}" . "%if{%{errs}}{" . "ERRORS LNK (%{errs}):\n" . "%the-pretty-err-msg{}" . "}" . "%let{err-count}{%get-resource-err-count{}}" . "%if{%{err-count}}{" . "ERRORS FILE (%{err-count})" . "%if{%{verbose}}{" . ":\n%get-resource-err{}" . "}{" . ": available with --verbose\n" . "}" . "}" . "%let{wrns}{%the-wrn-count{}}" . "%if{%{wrns}}{" . "WARNINGS LNK (%{wrns}):\n" . "%the-pretty-wrn-msg{}" . "}" . "%let{wrn-count}{%get-resource-wrn-count{}}" . "%if{%and{%{verbose}}{%{wrn-count}}}{" . "WARNINGS FILE (%{wrn-count})" . "%if{%{verbose}}{" . ":\n%get-resource-wrn{}" . "}{" . ": available with --verbose\n" . "}" . "}" . "%let{infs}{%the-inf-count{}}" . "%if{%and{%>={%{verbose}}{2}}{%{infs}}}{" . "INFOS LNK (%{infs}):\n" . "%the-pretty-inf-msg{}" . "}" . "%let{status_msg}{%get-log-status-{}}" . "%if{%and{%>={%{verbose}}{2}}{%{status_msg}}}{" . "STATUS FILE:\n%{status_msg}" . "}", "resource-errors-1and1" => "%let{fat-count}{%get-resource-fat-count{}}" . "%if{%{fat-count}}{" . "FATALS FILE (%{fat-count}):\n" . "%get-resource-fat{}" . "}" . "%let{errs}{%the-err-count{}}" . "%if{%{errs}}{" . "ERRORS LNK (%{errs}):\n" . "%the-pretty-err-msg{}" . "}" . "%let{err-count}{%get-resource-err-count{}}" . "%if{%{err-count}}{" . "ERRORS FILE (%{err-count}):\n" . "%get-resource-err{}" . "}" . "%let{wrns}{%the-wrn-count{}}" . "%if{%{wrns}}{" . "WARNINGS LNK (%{wrns}):\n" . "%the-pretty-wrn-msg{}" . "}" . "%let{wrn-count}{%get-resource-wrn-count{}}" . "%if{%and{%{verbose}}{%{wrn-count}}}{" . "WARNINGS FILE (%{wrn-count})" . "%if{%{verbose}}{" . ":\n%get-resource-wrn{}" . "}{" . ": available with --verbose\n" . "}" . "}" . "%let{infs}{%the-inf-count{}}" . "%if{%and{%{verbose}}{%{infs}}}{" . "INFOS LNK (%{infs}):\n" . "%the-pretty-inf-msg{}" . "}" . "%let{status_msg}{%get-log-status-{}}" . "%if{%and{%{verbose}}{%{status_msg}}}{" . "STATUS FILE:\n%{status_msg}" . "}", "comminfo" => "%let{txt}{%the-pretty-global-msg{}}" . "%let{txt}{%subst{%{txt}}{^.*CONNECTED.*\$\n*}{}{mg}}" . "%if{%{txt}}{" . "++++++++++++ Global / Communication Status ++++++++++++\n" . "%{txt}" . "}" . "%let{timeslip}{%-{%time{}}{%real-time{}}}" . "%if{%>{%{timeslip}}{%{window}}}{" . "++++++++++++ Lamport time slip: %human-seconds{%{timeslip}} ++++++++++++\n" . "}", # drbd similar ones "state" => "NYI Please override macro \\%%{0}\\{\\}", "cstate" => "NYI Please override macro \\%%{0}\\{\\}", "dstate" => "NYI Please override macro \\%%{0}\\{\\}", "status" => "NYI Please override macro \\%%{0}\\{\\}", ); my %view_macros = %complex_macros; # add some trivial macros to the command line interface # FIXME: only at most one argument is allowed for now. my %trivial_globs = ( # intended for human use "{all,the}-{pretty-,}{global-,}{{err,wrn,inf}-,}msg" => "", "{is,todo,nr}-{attach,sync,fetch,replay,primary,secondary}" => "", "is-{split-brain,consistent,emergency,orphan}" => "", "is-{member,guest}" => "", "rest-space" => "", "get-{disk,device}" => "", "present-{disk,device}" => "(deprecated, use *-present instead)", "disk-error" => "", "{disk,device}-present" => "", "device-{opened,nrflying,error,completion-{stamp,age}}" => "", "get-log-status" => "", "get-resource-{fat,err,wrn}{,-count}" => "", # intended for scripting "{my,all}-{resources,members,guests}" => "", "count-{my,all}-{resources,members,guests}" => "", "{cluster,resource}-members" => "deprecated", "count-{cluster,resource,guest}-members" => "deprecated", "{cluster,resource,guest}-peers" => "", "count-{cluster,resource,guest}-peers" => "", "{disk,resource,device}-size" => "", "resource-possible-size" => "", "deletable-size" => "", "occupied-size" => "", "replay-code" => "When negative, this indidates that a replay/recovery error has occurred.", "errno-text" => "Convert errno numbers (positive or negative) into human readable text.", "{sync,fetch,replay,work,syncpos}-{size,pos}" => "", "{sync,fetch,replay,work}-{rest,{almost-,threshold-,}reached,percent,permille,vector}" => "", "writeback-rest" => "", "{sync,fetch,replay}-{ops-rate,amount-rate,rate,remain}" => "", "device-{ops-rate,amount-rate,rate}" => "", "replay-basenr" => "", "{fetch,replay,work}-{lognr,logcount}" => "", "summary-vector", => "", "{time,real-time}" => "", "{alive,fetch,replay,work}-{timestamp,age,lag}" => "", "{potential,implemented,usable}-features" => "", "{implemented,usable}-{digests,compressions}" => "", "enabled-{log|net}-compressions" => "", "disabled-{log|net}-digests" => "", "used-{log,net}-{digest,compression}" => "", "{tree,features}-version" => "", "{get,actual}-primary" => "", "is-{alive}" => "", "systemd-unit" => "", "uuid" => "", "tree" => "", "wait-{is,todo}-{attach,sync,fetch,replay,primary,secondary}-{on,off}" => "", ); my $glob = ""; foreach my $new_glob (sort alphanum_cmp keys(%trivial_globs)) { $glob .= "," if $glob; $glob .= $new_glob; } foreach my $name (lamport_glob("{$glob}")) { $view_macros{$name} = "\%primitive-$name\{\%\{1}}"; } sub _get_pre { my ($rest, $add) = @_; $rest =~ s{(\A.*\n)}{}sp; $add = 0 if $1; return length($rest) + $add; } sub _break_line { my ($result, $add, $indent) = @_; my $pre_len = _get_pre($result, $add); if ($pre_len != $indent) { $result .= "\\\n" . ' ' x $indent; } return $result; } sub _pretty_macro { my ($text, $add, $indent) = @_; $text =~ s/\\n/\n/gs; my $result = ""; # look for function calls while ($text =~ m/^($match_fn_head(?:{})*)/mp) { $result .= $PREMATCH; my $fn = $1; $text = $POSTMATCH; $result = _break_line($result, $add, $indent); $add = 0; $result .= $fn; while ($text =~ m/\A\{/sp) { # don't break simple / non-recursive / unbreakable arguments if ($text =~ m/\A(\{(?:\s$match_inner|[^%{}]*)\})/sp) { my $shortcut = $1; $text = $POSTMATCH; # make newlines non-verbatim $shortcut =~ s{\n}{\\n}spg; $result .= $shortcut; next; } # break more complex arguments #$result .= "{\\\n" . ' ' x $indent; $result .= "{"; if ($text =~ m/\A\{([\s]$match_inner|)\}/sp) { $result .= "$1}"; $text = $POSTMATCH; } elsif ($text =~ m/\A\{([^\s]$match_inner)\}/sp ) { my $arg = $1; $text = $POSTMATCH; my $sub_add = _get_pre($result, 0); my $subst = _pretty_macro($arg, $sub_add, $indent + 2); $result .= $subst; $result = _break_line($result, 0, $indent); $result .= "}"; } else { ldie "wtf '$text'?\n"; } } } $result .= $text; return $result; } sub pretty_macro { my $txt = _pretty_macro(@_); # always add a trailing newline (for vi ;) $txt .= "\\\n" unless $txt =~ m{\n\Z}sp; return $txt; } get_global_versions(); if (defined($ARGV[0]) && $ARGV[0] =~ m/^dump-(all-)?macros$/) { my %macros = %complex_macros; %macros = %view_macros if defined($1); foreach my $name (keys %macros) { my $txt = $macros{$name}; $txt = pretty_macro($txt, 0, 0); my $file = "$name.tpl"; if (-r $file) { # backup already existing files for (my $i = 0; ; $i++) { my $file_old = "$file.old$i"; unless (-r $file_old) { rename($file, $file_old); last; } } } open(OUT, ">", $file) or ldie "cannot create file '$file'\n"; print OUT $txt; close(OUT); } exit(0); } sub get_macro { my ($cmd, $tolerate) = @_; if ($macro) { my $orig = $macro; $macro = ""; # consume once return $orig; } foreach my $path (@MARS_PATH) { my $file = "$path/$cmd.tpl"; if (-r $file) { lprint_stderr "using macro file '$file'\n" if (defined($view_macros{$cmd})); local $/; # slurp open(IN, "<", $file) or next; my $tpl = ; close(IN); $tpl =~ s{$match_comment}{}sg; $tpl =~ s{\\n}{\n}sg; return $tpl; } } return $view_macros{$cmd} if (defined($view_macros{$cmd})); ldie "cannot find macro '$cmd'\n" unless defined($tolerate); return ""; } sub view_cmd { my ($cmd, $res) = (shift, shift); if ($cmd =~ s/^prettyprint-//) { my $txt = get_macro($cmd); print pretty_macro($txt, 0); return; } $cmd =~ s/^view-?//; $cmd = "default" unless $cmd; my $tpl = get_macro($cmd); my $result = eval_macro($cmd, $res, $tpl, @_); if ($result ne "") { # add trailing newline if none exists chomp $result; $result .= "\n"; } print $result; } ################################################################## # command table of all commands my %cmd_table = ( # new keywords "create-uuid" => [ "verbose 3", "usage: create-uuid (no parameters)", "Deprecated.", "This is only needed if you have a very old $mars/", "directory structure from MARS version light0.1beta05", "or earlier.", \&create_uuid, ], "create-cluster" => [ "usage: create-cluster (no parameters)", "This must be called exactly once when creating a new cluster.", "Don't call this again! Use join-cluster on the secondary nodes.", "Please read the PDF manual for details.", \&create_cluster, ], "join-cluster" => [ "usage: join-cluster ", "Establishes a new cluster membership.", "This must be called once on any new cluster member.", "This is a prerequisite for join-resource.", \&join_cluster, ], "merge-cluster" => [ "usage: merge-cluster []", "Precondition: the resource names of both clusters must be disjoint.", "Create the union of two clusters, consisting of the", "union of all machines, and the union of all resources.", "The members of each resource are _not_ changed by this.", "This is useful for creating a big \"virtual LVM cluster\" where", "resources can be almost arbitrarily migrated between machines via", "later join-resource / leave-resource operations.", \&merge_cluster, ], "merge-cluster-list" => [ "usage: merge-cluster-list", "Determine the local list of resources.", "Useful for checking or analysis of merge-cluster disjointness by hand.", \&merge_cluster, ], "merge-cluster-check" => [ "usage: merge-cluster-check ", "Check whether the resources of both clusters are disjoint.", "Useful for checking in advance whether merge-cluster would be", "possible.", \&merge_cluster, ], "split-cluster" => [ "usage: split-cluster (no parameters)", "DEPRECATED", "NOT OFFICIALLY SUPPORTED - ONLY FOR EXPERTS.", "RTFS = Read The Fucking Sourcecode.", "Use this only if you know what you are doing.", \&split_cluster, ], "leave-cluster" => [ "usage: leave-cluster (no parameters)", "This can be used for final deconstruction of a cluster member.", "Prior to this, all resources must have been left", "via leave-resource.", "Notice: this will never destroy the cluster UID on the /mars/", "filesystem.", "Please read the PDF manual for details.", \&leave_cluster, ], "create-resource" => [ "usage: create-resource ", "(further syntax variants are described in the PDF manual).", "Create a new resource out of a pre-existing disk (backing", "block device) /dev/lv/mydata (or similar).", "The current node will start in primary role, thus", "/dev/mars/ will appear after a short time, initially", "showing the same contents as the underlying disk /dev/lv/mydata.", "It is good practice to name the resource and the", "disk name identical.", \&create_res, ], "join-resource" => [ "usage: join-resource ", "(further syntax variants are described in the PDF manual).", "The resource must have been already created on", "another cluster node, and the network must be healthy.", "The contents of the local replica disk /dev/lv/mydata will be", "overwritten by the initial fast full sync from the currently", "designated primary node.", "After the initial full sync has finished, the current host will", "act in secondary role.", "For details on size constraints etc, refer to the PDF manual.", \&create_res, ], "leave-resource" => [ "Precondition: the local host must be in secondary role.", "Stop being a member of the resource, and thus stop all", "replication activities. The status of the underlying disk", "will remain in its current state (whatever it is).", \&leave_res_phase0, "check preconditions", \&leave_res_phase1, "switch state", \&leave_res_phase2, "purge logfiles", \&leave_res_phase3, "wait for deletions", \&leave_res_phase4, "trigger systemd", ], "delete-resource" => [ "CAUTION! This is dangerous when the network is somehow", "interrupted, or when damaged nodes are later re-surrected", "in any way.\n", "Precondition: the resource must no longer have any members", "(see leave-resource).", "This is only needed when you _insist_ on re-using a damaged", "resource for re-creating a new one with exactly the same", "old .", "HINT: best practice is to not use this, but just create a _new_", "resource with a new out of your local disks.", "Please read the PDF manual on potential consequences.", \&delete_res_phase1, "set links", \&delete_res_phase2, "trigger systemd", ], "set-systemd-unit" => [ "usage: set-systemd-unit []", "This activates the systemd template engine of marsadm.", "Please read mars-user-manual.pdf on this.", "When is omitted, it will be treated equal to", ".", "You may also use special keywords like DEFAULT, please read the manuals.", "PARALLEL", "FORK", \&set_systemd_unit_phase1, "set links", \&set_systemd_unit_phase2, "trigger systemd", ], "get-systemd-unit" => [ "usage: get-systemd-unit ", "Show the system units (for start and stop), or empty when unset.", \&get_systemd_unit, ], "set-systemd-want" => [ "usage: set-systemd-want ", "Override the current location where the complete systemd unit stack", "should be started.", "Useful for a _temporary_ stop of the systemd unit stack by supplying", "the special hostname \"(none)\".", "For a _permanent_ stop, use \"marsadm set-systemd-unit \"", "instead.", "PARALLEL", "FORK", \&set_systemd_want_phase1, "set links", "LOOP", \&set_systemd_want_phase2, "wait for effect", ], "get-systemd-want" => [ "usage: get-systemd-want ", "Show the current hostname where the complete systemd unit stack", "between start- and stop-unit should appear.", "Reports empty when unset, or \"(none)\" when stopped.", \&get_systemd_want, ], "set-connect-pref-list" => [ "verbose 2", "usage: set-connect-pref-list ", "Provisionary command for internal use at 1&1. Will be replaced by", "a better concept somewhen. The must be comma-separated.", \&set_connect_pref_list, ], "get-connect-pref-list" => [ "verbose 2", "usage: get-connect-pref-list ", "Provisionary command for internal use at 1&1. Will be replaced by", "a better concept somewhen.", "Shows the outcome of set-connect-pref-list.", \&set_connect_pref_list, ], "set-global-enabled-log-compressions" => [ "usage: set-global-enabled-log-compressions ", "Tell the whole cluster which compression features to use globally", "for logfile compression. The effective value can be checked via", "\"marsadm view-enabled-log-compressions\".", "See \"marsadm view-potential-features\" and", "\"marsadm --help\" for a list of compression feature names,", "which must be separated by | symbols.", \&set_defaults, ], "set-global-enabled-net-compressions" => [ "usage: set-global-enabled-net-compressions ", "Tell the whole cluster which compression features to use globally for", "network transport compression. This is independent from log compression.", "The effective value can be checked via", "\"marsadm view-enabled-log-compressions\".", "See \"marsadm view-potential-features\" and", "\"marsadm --help\" for a list of compression feature names,", "which must be separated by | symbols.", \&set_defaults, ], "set-global-disabled-log-digests" => [ "usage: set-global-disabled-log-digests ", "Tell the whole cluster which checksumming digests to disable globally", "for the payload in transaction logfiles.", "The effective value can be checked via \"marsadm view-disabled-log-digests\".", "See \"marsadm view-potential-features\" and", "\"marsadm --help\" for a list of digest feature names,", "which must be separated by | symbols.", \&set_defaults, ], "set-global-disabled-net-digests" => [ "usage: set-global-disabled-net-digests ", "Tell the whole cluster which checksumming digests to disable globally", "for cluster-wide data comparisons, like fast full-sync.", "The effective value can be checked via \"marsadm view-disabled-net-digests\".", "See \"marsadm view-potential-features\" and", "\"marsadm --help\" for a list of digest feature names,", "which must be separated by | symbols.", \&set_defaults, ], "set-disabled-digests" => [ "verbose 99", # NYI "usage: set-disabled-digests ", "Tell the whole cluster which checksumming digests to disable at the", "resource , potentially overriding the global value", "as set by set-global-disabled-digests.", "The effective value can be checked via \"marsadm view-disabled-digests\".", "See \"marsadm view-potential-features\" and", "\"marsadm --help\" for a list of digest feature names,", "which must be separated by | symbols.", \&set_defaults, ], "log-rotate" => [ "Only useful at the primary side.", "Start writing transaction logs into a new transaction logfile.", "This should be regularly called by a cron job or similar.", "For regular maintainance cron jobs, please prefer 'marsadm cron'.", "For details and best practices, please refer to the PDF manual.", \&logrotate_res, ], "log-delete-one" => [ "When possible, globally delete at most one old transaction logfile", "which is known to be superfluous, i.e. all secondaries no longer", "need to replay it.", "Hint: use this only for testing and manual inspection.", "For regular maintainance cron jobs, please prefer cron", "or log-delete-all.", \&logdelete_res, ], "log-delete" => [ "When possible, globally delete all old transaction logfiles which", "are known to be superflous, i.e. all secondaries no longer need", "to replay them.", "This must be regularly called by a cron job or similar, in order", "to prevent overflow of the /mars/ directory.", "For regular maintainance cron jobs, please prefer 'marsadm cron'.", "For details and best practices, please refer to the PDF manual.", \&logdelete_res, ], "log-delete-all" => [ "Alias for log-delete", \&logdelete_res, ], "cron" => [ "usage: cron (no parameters)", "Do all necessary regular housekeeping tasks.", "This is equivalent to log-rotate all; sleep 7; log-delete-all all.", [ \&link_purge_global, "purge global links", ], \&cron_phase1, "purge links and rotate logfiles", "SLEEP", \&cron_phase2, "delete old logfiles", \&cron_phase3, "autoclean gone peers (when enabled via --autoclean)", "CHANGE_RESOURCES", "ANY", [ \&cron_phase4, "autoclean gone resources (when enabled via --autoclean)", ], ], "log-purge-all" => [ "This is potentially dangerous.", "Use this only if you are really desperate in trying to resolve a", "split brain. Use this only after reading the PDF manual!", \&log_purge_res, ], "err-purge-all" => [ "Remove any err message from the given resources.", \&err_purge_res, ], "link-purge-all" => [ "Remove any .deleted links.", [ \&link_purge_global, "purge global links", ], \&link_purge_res, "purge per-resource links", ], "fake-sync" => [ "verbose 1", "Attention: this is potentially dangerous.", "Only for experts.", "Please read the PDF manual to understand the risks!", \&fake_sync_phase1, "switch sync off", \&fake_sync_phase2, "wait for sync off", \&fake_sync_phase3, "fake sync", ], "set-link" => [ "verbose 1", "usage: set-link ", "Only for experts.", \&set_link_cmd, ], "get-link" => [ "verbose 1", "usage: get-link ", "Only for experts.", \&set_link_cmd, ], "set-global-sync-limit-value" => [ "usage: set-sync-limit-value ", "Set the maximum number of resources which should by syncing", "concurrently.", \&set_sync_limit_value, ], "get-global-sync-limit-value" => [ "usage: get-sync-limit-value (no parameters)", "For retrieval of the value set by set-global-sync-limit-value.", \&set_sync_limit_value, ], "set-sync-limit-value" => [ "verbose 3", "Deprecated.", "Please use set-global-sync-limit-value instead.", \&set_sync_limit_value, ], "get-sync-limit-value" => [ "verbose 3", "Deprecated.", "Please use get-global-sync-limit-value instead.", \&set_sync_limit_value, ], "delete-file" => [ "verbose 1", "usage: delete-file ", "VERY dangerous!", "Only for experts.", \&delete_file_cmd, ], "set-emergency-limit" => [ "usage: set-emergency-limit ", "Set a per-resource emergency limit for disk space in /mars.", "See PDF manual for details.", \&emergency_limit_res, ], "get-emergency-limit" => [ "Counterpart of set-emergency-limit (per-resource emergency limit)", \&emergency_limit_res, ], "emergency-limit" => \&emergency_limit_res, "cat" => [ "usage: cat ", "Print internal debug output in human readable form.", "Numerical timestamps and numerical error codes are replaced", "by more readable means.", "Example: marsadm cat /mars/5.total.status", \&cat_cmd, ], "show" => [ "verbose 3", "Deprecated old low-level tool. Don't use. Use macros instead.", \&show_cmd, ], "show-errors" => [ "verbose 3", "Deprecated old low-level tool. Don't use. Use macros instead.", \&show_errors_cmd, ], "show-state" => [ "verbose 3", "Deprecated old low-level tool. Don't use. Use macros instead.", \&mars_state_cmd, ], "mars-state" => [ "verbose 3", "Deprecated old low-level tool. Don't use. Use macros instead.", \&mars_state_cmd, ], "show-info" => [ "verbose 3", "Deprecated old low-level tool. Don't use. Use macros instead.", \&mars_info_cmd, ], "mars-info" => \&mars_info_cmd, # deprecated "pause-replay-local" => [ "Stop replaying transaction logfiles for now.", "This is independent from any {pause,resume}-fetch operations.", "This may be used for freezing the state of your replica for some", "time, if you have enough space on /mars/.", "Only useful on a secondary node.", \&pause_replay_local_res, ], "pause-replay-global" => [ "Like pause-replay-local, but affects all resource members", "in the cluster (remotely).", \&pause_replay_global_res, ], "pause-replay" => [ "See pause-replay-local.", \&pause_replay_local_res, ], "resume-replay-local" => [ "Restart replaying transaction logfiles, when there is some", "data left.", "This is independent from any {pause,resume}-fetch operations.", "This should be used for unfreezing the state of your local replica.", "Only useful on a secondary node.", \&pause_replay_local_res, ], "resume-replay-global" => [ "Like resume-replay-local, but affects all resource members", "in the cluster (remotely).", \&pause_replay_global_res, ], "resume-replay" => [ "See resume-replay-local.", \&pause_replay_local_res, ], "set-replay" => [ "verbose 1", "VERY dangerous!", "Only for experts.", \&set_replay_res, ], "wait-umount" => [ "Wait until /dev/mars/ has disappeared in the", "cluster (even remotely).", "Useful on both primary and secondary nodes.", \&wait_umount_res, ], "wait-cluster" => [ "usage: wait-resource []", "Waits until a ping-pong communication has succeeded in the", "whole cluster (or only the members of ).", "NOTICE: this is extremely useful for avoiding races when scripting", "in a cluster.", \&wait_cluster, ], "wait-resource" => [ "usage: wait-resource ", " [[attach|fetch|replay|sync][-on|-off]]", "Wait until the given condition is met on the resource, locally.", \&wait_cond, ], "update-cluster" => [ "usage: update-cluster []", "Fetch all the links from all joined cluster hosts.", "Use this between create-resource and join-resource.", "NOTICE: this is extremely useful for avoiding races when scripting", "in a cluster.", \&update_cluster, ], "activate-guest" => [ "usage: activate-guest ", "Conditional update-cluster, so that will be locally", "known at the local machine, and mark the resource as a guest.", "Useful inbetween create-resource and join-resource.", "A guest will receive any symlink updates much more frequently.", "Prefer this over update-cluster when interested in a resource.", \&activate_guest, ], "deactivate-guest" => [ "usage: deactivate-guest ", "Precondition: the resource must not have local storage assigned.", "Useful for cleaning up a pure guest relastionship.", \&activate_guest, ], # compatible keywords (or their derivatives) "attach" => [ "Attaches the local disk (backing block device) to the resource.", "The disk must have been previously configured at", "{create,join}-resource.", "When designated as a primary, /dev/mars/\$res will also appear.", "This does not change the state of {fetch,replay}.", "For a complete local startup of the resource, use 'marsadm up'.", \&attach_res_phase0, "check preconditions", \&attach_res_phase1, "switch state", \&attach_res_phase2, "wait for effect", ], "detach" => [ "Detaches the local disk (backing block device) from the", "MARS resource.", "Caution! you may read data from the local disk afterwards,", "but ensure that no data is written to it!", "Otherwise, you are likely to produce harmful inconsistencies.", "When running in primary role, /dev/mars/\$res will also disappear.", "This does not change the state of {fetch,replay}.", "For a complete local shutdown of the resource, use 'marsadm down'.", \&attach_res_phase0, "check preconditions", "FORK", \&attach_res_phase0b, "wait for systemd device release", \&attach_res_phase1, "switch state", \&attach_res_phase2, "wait for effect", ], "resume-fetch-local" => [ "Start fetching transaction logfiles from the current", "designated primary node, if there is one.", "This is independent from any {pause,resume}-replay operations.", "Only useful on a secondary node.", \&fetch_local_res, ], "resume-fetch-global" => [ "Like resume-fetch-local, but affects all resource members", "in the cluster (remotely).", \&fetch_global_res, ], "resume-fetch" => [ "See resume-fetch-local.", \&fetch_local_res, ], "pause-fetch-local" => [ "Stop fetching transaction logfiles from the current", "designated primary.", "This is independent from any {pause,resume}-replay operations.", "Only useful on a secondary node.", \&fetch_local_res, ], "pause-fetch-global" => [ "Like pause-fetch-local, but affects all resource members", "in the cluster (remotely).", \&fetch_global_res, ], "pause-fetch" => [ "See pause-fetch-local.", \&fetch_local_res, ], "connect-local" => [ "See resume-fetch-local.", \&fetch_local_res, ], "connect-global" => [ "Like resume-fetch-local, but affects all resource members", "in the cluster (remotely).", \&fetch_global_res, ], "connect" => [ "See resume-fetch-local.", \&fetch_local_res, ], "disconnect-local" => [ "See pause-fetch-local.", \&fetch_local_res, ], "disconnect-global" => [ "Like pause-fetch-local, but affects all resource members", "in the cluster (remotely).", \&fetch_global_res, ], "disconnect" => [ "See pause-fetch-local.", \&fetch_local_res, ], "syncer" => \&ignore_cmd, "up" => [ "Shortcut for attach + resume-sync + resume-fetch + resume-replay.", \&up_res_phase0, "check preconditions", \&up_res_phase1, "switch state", \&up_res_phase2, "wait for effect", ], "down" => [ "Shortcut for detach + pause-sync + pause-fetch + pause-replay.", \&up_res_phase0, "check preconditions", "FORK", \&attach_res_phase0b, "wait for systemd device release", \&up_res_phase1, "switch state", \&up_res_phase2, "wait for effect", ], "primary" => [ "Promote the resource into primary role.", "This is necessary for /dev/mars/\$res to appear on the local host.", "Notice: by concept there can be only _one_ designated primary", "in a cluster at the same time.", "The role change is automatically distributed to the other nodes", "in the cluster, provided that the network is healthy.", "The old primary node will _automatically_ go", "into secondary role first. This is different from DRBD!", "With MARS, you don't need an intermediate 'secondary' command", "for switching roles.", "It is usually better to _directly_ switch the primary roles", "between both hosts.", "When --force is not given, a planned handover is started:", "the local host will only become actually primary _after_ the", "old primary is gone, and all old transaction logs have been", "fetched and replayed at the new designated priamry.", "When --force is given, no handover is attempted. A a consequence,", "a split brain situation is likely to emerge.", "Thus, use --force only after an ordinary handover attempt has", "failed, and when you don't care about the split brain.", "For more details, please refer to the PDF manual.", [ \&wait_cluster_noforce, ], \&primary_phase0, "check preconditions", "FORK", \&primary_phase0a, "conditionally wait for fetch off", \&primary_phase0b, "wait for systemd", "LOOP", \&primary_phase1, "leave primary state", "LOOP", \&primary_phase1b, "trigger remote", "LOOP", \&primary_phase2, "wait for cluster when necessary", "LOOP", \&primary_phase2b, "avoid split brain", \&primary_phase3, "switch to primary", \&primary_phase3b, "trigger remote", \&primary_phase4, "wait for device", "LOOP", \&primary_phase5, "trigger systemd", ], "secondary" => [ "Promote all cluster members into secondary role, globally.", "In contrast to DRBD, this is not needed as an intermediate step", "for planned handover between an old and a new primary node.", "The only reasonable usage is before the last leave-resource of the", "last cluster member, immediately before leave-cluster is executed", "for final deconstruction of the cluster.", "In all other cases, please prefer 'primary' for direct handover", "between cluster nodes.", "Notice: 'secondary' sets the global designated primary node", "to '(none)' which in turn prevents the execution of 'invalidate'", "or 'join-resource' or 'resize' anywhere in the cluster.", "Therefore, don't unnecessarily give 'secondary'!", \&primary_phase0, "check preconditions", "FORK", \&primary_phase1, "leave primary state", "LOOP", \&primary_phase1b, "trigger remote", \&primary_phase4, "wait for effect", "LOOP", \&primary_phase5, "trigger systemd", ], "invalidate" => [ "Only useful on a secondary node.", "Forces MARS to consider the local replica disk as being", "inconsistent, and therefore starting a fast full-sync from", "the currently designated primary node (which must exist;", "therefore avoid the 'secondary' command).", "This is usually needed for resolving emergency mode.", "When having k=2 replicas, this can be also used for", "quick-and-simple split-brain resolution.", "In other cases, or when the split-brain is not resolved by", "this command, please use the 'leave-resource' / 'join-resource'", "method as described in the PDF manual (in the right order as", "described there).", \&invalidate_res_phase0, "check preconditions", \&invalidate_res_phase1, "switch off everything", \&invalidate_res_phase2, "wait for everything switched off", \&invalidate_res_phase3, "start deletion of old version links", \&invalidate_res_phase4, "wait for effect", \&invalidate_res_phase5, "start purging old logfiles", \&invalidate_res_phase6, "wait for effect", \&invalidate_res_phase7, "set new replaylink", \&invalidate_res_phase8, "wait for effect", \&invalidate_res_phase9, "switch on everything again", ], "invalidate-remote" => \&forbidden_cmd, "resize" => [ "Prerequisite: all underlying disks (usually /dev/vg/\$res) must", "have been already increased, e.g. at the LVM layer (cf. lvresize).", "Causes MARS to re-examine all sizing constraints on all members of", "the resource, and increase the global logical size of the resource", "accordingly.", "Shrinking is currently not yet implemented.", "When successful, /dev/mars/\$res at the primary will be increased", "in size. In addition, all secondaries will start an incremental", "fast full-sync to get the enlarged parts from the primary.", \&resize_phase0, "check preconditions", \&resize_phase1, "set new size", \&resize_phase2, "wait for change", ], "check-resize" => \&ignore_cmd, "create-md" => \&senseless_cmd, "get-gi" => \&ignore_cmd, "show-gi" => \&ignore_cmd, "dump-md" => \&senseless_cmd, "outdate" => \&ignore_cmd, "adjust" => \&ignore_cmd, "wait-connect" => [ "usage: wait-connect []", "See wait-cluster.", \&wait_cluster, ], "role" => [ "verbose 3", "Deprecated.", "Please use the macro command 'view-role' instead.", "For even better summary information, use plain 'view'.", \&role_cmd, ], "state" => [ "verbose 3", "Deprecated.", "Please use the macro command 'view-role' instead.", "For even better summary information, use plain 'view'.", \&role_cmd, ], "cstate" => \&nyi_cmd, "dstate" => \&nyi_cmd, "status" => \&nyi_cmd, "dump" => \&senseless_cmd, "verify" => \&forbidden_cmd, "pause-sync-local" => [ "Pause the initial data sync at current stage.", "This has only an effect if a sync is actually running (i.e.", "there is something to be actually synced).", "Don't pause too long, because the local replica will remain", "inconsistent during the pause.", "Use this only for limited reduction of system load.", "Only useful on a secondary node.", \&pause_sync_local_res, ], "pause-sync-global" => [ "Like pause-sync-local, but affects all resource members", "in the cluster (remotely).", \&pause_sync_global_res, ], "pause-sync" => [ "See pause-sync-local.", \&pause_sync_local_res, ], "resume-sync-local" => [ "Resume any initial / incremental data sync at the stage where it", "had been interrupted by pause-sync.", "Only useful on a secondary node.", \&pause_sync_local_res, ], "resume-sync-global" => [ "Like resume-sync-local, but affects all resource members", "in the cluster (remotely).", \&pause_sync_global_res, ], "resume-sync" => [ "See resume-sync-local.", \&pause_sync_local_res, ], "new-current-uuid" => \&senseless_cmd, "hidden-commands" => \&ignore_cmd, # lowlevel tools "lowlevel-ls-host-ips" => [ "usage: lowlevel-ls-host-ips", "List cluster member names and IP addresses.", \&lowlevel_ls_host_ips, ], "lowlevel-set-host-ip" => [ "usage: lowlevel-set-host-ip []", "Set IP address for host.", "When is not given, try to determine the old address", "from the symlink tree, or from old backups.", "Often, you want to set a new IP address in place of an old one.", "Hint: you may also use the --ip-= option.", \&lowlevel_set_host_ip, ], "lowlevel-delete-host" => [ "usage: lowlevel-delete-host ", "Delete cluster member.", \&lowlevel_delete_host, ], # systemd interface "systemd-trigger" => [ "usage: systemd-trigger []", \&systemd_trigger_extern, ], "systemd-trigger-extern" => [ \&systemd_trigger_extern, ], ); sub helplist { my $msg = shift; print "ERROR: $msg" if ($msg); print " Thorough documentation is in mars-user-manual.pdf. Please use the PDF manual as authoritative reference! Here is only a short summary of the most important sub-commands / options: marsadm [] [ | all | ] marsadm [] view[-] [ | all ] = --force Skip safety checks. Use this only when you really know what you are doing! Warning! This is dangerous! First try --dry-run. Not combinable with 'all'. --ignore-sync Allow primary handover even when some sync is running somewhere. This is less rude than --force because it checks for all else preconditions. --ignore-deleted-peers= 0 = off 1 = only ignore deleted peers at reports (default) 2 = Only for EXPERTS. This may be dangerous. Ignorance about the existence of a peer may be very harmful in a Distributed System. --dry-run Don't modify the symlink tree, but tell what would be done. Use this before starting potentially harmful actions such as 'delete-resource'. --verbose Increase speakyness of some commands. --parallel Only resonable when combined with \"all\". For each resource, fork() a sub-process running independently from other resources. May seepd up handover a lot. However, several cluster managers are known to have problems with a high parallelism degree (up to deadlocks). Only use this after thorough testing in combination with your whole operation stack! Turns off --singlestep. --parallel= Like --parallel, but limit the parallelism degree to the given number of parallel processes. Turns off --singlestep. --singlestep Debugging aid for multi-phase commands. Interactively step through the various phases of commands. Turns off --parallel. --error-injection-phase= Only for testing. NEVER use in production. --delete-method= EXPERIMENTAL! Only for testing! This option will disappear again! == 0: Use new deletion method == 1: Use old deletion method default is 1 for compatibility. --logger=/path/to/usr/bin/logger Use an alternative syslog messenger. When empty, disable syslogging. --max-deletions= When your network or your firewall rules are defective over a longer time, too many deletion links may accumulate at /mars/todo-global/delete-* and sibling locations. This limit is preventing overflow of the filesystem as well as overloading the worker threads. --thresh-logfiles= --thresh-logsize= Prevention of too many small logfiles when secondaries are not catching up. When more than thresh-logfiles are already present, the next one is only created when the last one has at least size thresh-logsize (in units of GB). --timeout= Current default: $timeout Abort safety checks and waiting loops after timeout with an error. When giving 'all' as resource agument, this works for each resource independently. The special value -1 means \"infinite\". --window= Current default: $window Treat other cluster nodes as healthy when some communcation has occured during the given time window. --keep-backup-hours= --keep-backups= link-purge-all and cron will delete old backup files and old symlinks after this number of hours. Current default: $keep_backup_hours --threshold= Some macros like 'fetch-threshold-reached' use this for determining their sloppyness. --systemd-enable=<0|1> Enable / disable any systemd actions. On by default. --host= Act as if the command was running on cluster node . Warning! This is dangerous! First try --dry-run --backup-dir= Only for experts. Used by several special commands like merge-cluster, split-cluster etc for creating backups of important data. --ip-= Override the IP address of from the symlink tree, or as determined from old IP backups, or as determined from the list of network interfaces. Usually you will need this only at 'create-cluster' or 'join-cluster' / 'merge-cluster' / 'split-cluster' for resolving ambiguities, or for telling the IP address of yet unknown peers. It is also useful at 'lowlevel-set-host-ip' for updating an already existing IP address. Hint: this option may be given multiple times for different parts. --ip= Equivalent to --peer-\$host= where \$host is usually the same as \$(hostname), but you may use --host= as an _earlier_ argument for overriding the default . --ssh-port= Override the default ssh port (22) for ssh and rsync. Useful for running {join,merge}-cluster on non-standard ssh ports. --no-ssh Equivalent to --ssh-port=0 Disable ssh and rsync completely. Dead peers / interrupted networks / firewalling may lead to (temporary) hangs of ssh probes, which are used by default for backwards compatibility. Hint: ssh_config options like ConnectTimeout may also help. Use this to disable any probes, and no time loss. --ssh-opts=\"\" Override the default ssh commandline options. Also used for rsync. --macro= Handy for testing short macro evaluations at the command line. = [a-z][-a-z0-9]* with the exception of reserved names: $reserved_names = "; my $item_count = 0; foreach my $cmdname (sort alphanum_cmp keys(%cmd_table)) { my $list = $cmd_table{$cmdname}; next unless ref($list) eq "ARRAY"; my @copy = @$list; my $line_count = 0; my $verb_level = 0; while (my $txt = shift @copy) { last if ref($txt) eq "CODE"; next if ref($txt) eq "ARRAY"; next if $txt =~ m/^[A-Z]+$/; my $skip_this = 0; if ($txt =~ m/^verbose *([0-9]+)/) { $verb_level = $1; $skip_this = 1; } if ($verbose < 0) { last if $verb_level != -$verbose; } else { last if $verb_level > $verbose; } next if $skip_this; if (!$line_count) { print "\n" if $item_count++; print " $cmdname\n"; print " usage: $cmdname \n" if $txt !~ m/usage:/; } print " $txt\n"; $line_count++; } } print " = comma-separated list of resource names or \"all\" for all resources "; print " = | = "; foreach my $macroname (sort alphanum_cmp keys(%complex_macros)) { print " $macroname\n"; } print " = "; foreach my $glob (sort alphanum_cmp keys(%trivial_globs)) { my $txt = $trivial_globs{$glob}; print " $glob\n"; print " $txt\n" if $txt; } print " = "; my $nr = 0; foreach my $flagname (sort alphanum_cmp keys(%FEATURES_FLAGS)) { print " |\n" if ($nr++); print " $flagname"; } print "\n"; exit 0; } my @args; my @check_ids; foreach my $arg (@ARGV) { if ($arg eq "--force" || $arg eq "-f") { $force++; next; } elsif ($arg eq "--ignore-sync") { $ignore_sync++; next; } elsif ($arg =~ m/--ignore-deleted-peers\s*=\s*(-?[0-9]+)/) { $ignore_deleted_peers = $1; next; } elsif ($arg eq "--dry-run" || $arg eq "-d") { $dry_run++; next; } elsif ($arg =~ m/--max-deletions\s*=\s*(-?[0-9]+)/) { $max_deletions = $1; next; } elsif ($arg =~ m/--thresh-logfiles\s*=\s*([0-9]+)/) { $thresh_logfiles = $1; next; } elsif ($arg =~ m/--thresh-logsize\s*=\s*([0-9]+)/) { $thresh_logsize = $1; next; } elsif ($arg =~ m/--parallel(\s*=\s*(-?[0-9]+)?)?/) { $parallel = defined($2) ? $2 : 0; $single_step = 0; next; } elsif ($arg eq "--singlestep") { $single_step++; $parallel = -1; next; } elsif ($arg =~ m/--error-injection-phase\s*=\s*([0-9]+)/) { $inject_phase = $1; next; } elsif ($arg =~ m/--delete-method\s*=\s*([0-9]+)/) { $compat_deletions = $1; my $compat_path = "$mars/compat-deletions"; my $old = get_link($compat_path, 2); if (!defined($old) || $old ne $compat_deletions) { if (-d "/mars" && !$dry_run) { lprint "Setting '$compat_path' to '$compat_deletions'\n"; set_link($compat_deletions, $compat_path); finish_links(); } else { lwarn "Cannot set '$compat_path' to '$compat_deletions'\n"; } } next; } elsif ($arg =~ s/--verbose\s*=\s*(-?[0-9]+)/$1/) { $verbose = $arg; next; } elsif ($arg eq "--verbose" || $arg eq "-v") { $verbose++; next; } elsif ($arg =~ s/--logger\s*=\s*(.*)/$1/) { $logger = $arg; next; } elsif ($arg =~ s/--autoclean\s*=\s*([0-9]+)/$1/) { $cron_autoclean_days = $arg; next; } elsif ($arg =~ s/--timeout\s*=\s*([0-9]+)/$1/) { $timeout = $arg; next; } elsif ($arg =~ s/--window\s*=\s*([0-9]+)/$1/) { $window = $arg; next; } elsif ($arg =~ s/--keep-backup(?:-hour)?s\s*=\s*([0-9]+)/$1/) { $keep_backup_hours = $arg; next; } elsif ($arg =~ s/--threshold\s*=\s*([0-9]+)/$1/) { $threshold = get_size($arg); next; } elsif ($arg =~ s/--systemd-enable\s*=\s*([0-9]+)?/$1/) { $systemd_enabled = $1; $cm3_checked = 1; next; } elsif ($arg =~ s/--host\s*=\s*([-_A-Za-z0-9]+)/$1/) { push @check_ids, $arg; if ($arg ne $host) { lprint "ATTENTION: acting as if I were host '$arg'\n"; lwarn "some commands require local knowledge not available here.\n"; lwarn "thus something may fail or go wrong - use this at your risk!\n"; $host = $arg; } next; } elsif ($arg =~ m/--backup-dir\s*=\s*(\/[^\s\\:;|<>]+)/) { $backup_dir = $1; system("mkdir -p $backup_dir") and ldie "Cannot create backup directory '$backup_dir'\n"; next; } elsif ($arg =~ m/--ip(-(.*?))?\s*=\s*([0-9.:\[\]]+)/) { my $peer = $2; my $ip = $3; $peer = $host unless $peer; lprint_stderr "Using IP '$ip' from command line for '$peer'.\n"; $known_ips{$peer} = $ip; next; } elsif ($arg =~ m/--no-ssh/) { $ssh_port = 0; next; } elsif ($arg =~ s/--ssh[-_]port\s*=\s*([0-9]+)/$1/) { $ssh_port = $arg; lprint_stderr "Using SSH port '$ssh_port' from command line.\n"; next; } elsif ($arg =~ s/--ssh[-_]opts\s*=\s*(.*)/$1/) { $ssh_opts = $arg; lprint_stderr "Using SSH options '$ssh_opts' from command line.\n"; next; } elsif ($arg =~ s/--macro\s*=\s*(.*)/$1/) { $macro = $arg; $macro =~ s/\\n/\n/mg; next; } elsif ($arg =~ m/^--help$/ || $arg =~ m/^-h$/) { helplist; next; } elsif ($arg =~ m/^--version$/ || $arg =~ m/^-v$/) { version; next; } elsif ($arg =~ m/^-(.*)/) { ldie "unrecognized argument '-$1' (bad syntax)\n"; } if ($arg =~ s/^force-//) { $force++; } push @args, $arg; } # some postponed checks foreach my $check_arg (@check_ids) { check_id($check_arg, 1, 1); } my $cmd = shift @args || helplist "command argument is missing\n"; if ($cmd =~ m/^help$/ || $cmd =~ m/^h$/) { helplist; } if ($cmd =~ m/^version$/ || $cmd =~ m/^v$/) { version; } ldie "only root may use this tool\n" if $< != 0 && $cmd !~ m/^(cat|view.*|pretty.*)$/; # getpid() seems to be missing in perlfunc helplist "unknown command $cmd\n" if (!exists $cmd_table{$cmd} && !$cmd =~ m/view/); # setup syslogging if ($cmd !~ m/^(version$|v$|view)/ && -x $logger) { $notify = "(cmd: $cmd)"; my $print_id = $Id; $print_id =~ s/\$|Id:| //g; $print_id = substr($print_id, 0, 8); llog "$print_id $host $0 @ARGV\n"; } # checks ldie "The $mars directory does not exist.\n" unless -d $mars; my @mars_stat = stat($mars) or ldie "Cannot stat '$mars'\n"; # check uid if ($mars_stat[4]) { lwarn "Directory $mars has wrong owner uid $mars_stat[4].\n"; chown(0, 0, $mars) or ldie "Cannot chown $mars.\n"; lwarn "Fixed by setting to 0.\n"; } # check permissions my $perms = ($mars_stat[2] & 07777); if ($perms != 0700) { my $txt = sprintf("Directory $mars has wrong permissions 0%o.\n", $perms); lwarn($txt); chmod(0700, $mars) or ldie "Cannot chmod $mars.\n"; lwarn "Fixed to mode 0700 for security reasons.\n"; } # lowlevel tools if ($cmd =~ m/^lowlevel-/) { my $func = $cmd_table{$cmd}; if (!defined($func) || !$func) { ldie "Syntax error: lowlevel command '$cmd' not found\n"; } if (ref($func) eq "ARRAY") { lwarn "EXPERTS ONLY -- risky lowlevel command '$cmd'\n"; my @list = @$func; while (@list) { my $memb_func = shift @list; next unless ref($memb_func) eq "CODE"; &{$memb_func}($cmd, @args); } finish_links(); exit(0); } else { ldie "Internal error: command table is wrong for '$cmd'"; } } if ($cmd !~ m/(create|join)-cluster|create-uuid|systemd|cat|view|pretty/) { my $uuid = get_link("$mars/uuid", 1); ldie "No valid cluster UUID $mars/uuid found. You need {create,join}-cluster first (or create-uuid).\n" unless $uuid; } my $res = ""; if ($cmd =~ "show|cron") { $res = shift @args || "all"; } elsif ($cmd =~ m/^(view|pretty)|trigger/) { $res = shift @args || ""; } elsif ($cmd =~ m/^set-.*-list$/) { $res = shift @args || helplist "comma-separated list argument is missing\n"; } elsif ($cmd =~ m/^set-.*-value$/) { $res = shift @args; if (!defined($res) || $res eq "") { helplist "numeric argument for '$cmd' is missing\n"; } ldie "argument '$res' isn't numeric\n" unless $res =~ m/^[0-9.]+$/; } elsif ($cmd =~ m/^(set|get)-global-/) { $res = ""; } elsif ($cmd =~ m/^set-/) { $res = shift @args || helplist "resource argument is missing\n"; check_id_list($res, 0, 1, 1); } elsif ($cmd =~ m/^(join|merge)-cluster$/) { $res = shift @args || helplist "peer argument is missing\n"; } elsif (!($cmd =~ m/^(create|split|leave|wait|update)-cluster|merge-cluster-list|create-uuid|cat|[a-z]+-file/)) { $res = shift @args || helplist "resource argument is missing\n"; check_id_list($res, 0, 1, 1); } lwarn "Using FORCE option -- hopefully you know what you are doing!\n" if $force; my %checked_res; sub do_one_res { my $func = shift; my ($cmd, $res) = @_; if ($inject_phase && $phase_nr == $inject_phase) { ldie "ERROR INJECTION cmd='$cmd' res='$res' phase='$phase_nr'\n"; } if ($cmd =~ m/^cat|^(set|get)-global-|-file$|-list$|-link$|-value$/) { # no resource argument } elsif (!$checked_res{"$cmd$res"}) { $res = check_res($res) unless (!$res || $cmd =~ m/^(join|create|merge|leave|wait)-cluster|(create|join)-resource|show/); check_res_member($cmd, $res) unless (!$res || $cmd =~ m/^(join|create|delete)-(cluster|resource)|^(merge|leave|wait)-cluster|activate-guest|-purge-|^show|^view/); detect_splitbrain($res, 1); $checked_res{"$cmd$res"} = 1; } call_hook(!$force, "pre", @_); my $status = &{$func}(@_); call_hook(!$force, "post", @_); return $status; } sub expand_res_list { my ($cmd, $res) = @_; my @res_list=(); if ($res eq "all" && $cmd !~ m/show|cat|cluster|set-link|delete-file/) { @res_list = get_any_resources($host); } elsif ($res =~ m/,/) { @res_list = split(",", $res); } # check for any systemd activations once if ($parallel <= -999) { $parallel = -998; my $count = scalar(@res_list); if ($count > 1) { my $systemd_activated = 0; foreach my $this_res (@res_list) { if (systemd_present($cmd, $this_res)) { $systemd_activated++; last; } } if ($systemd_activated) { lprint "Systemd is activated, acting like --parallel=0\n" if $verbose; $parallel = 0; } } } return sort alphanum_cmp @res_list; } sub do_all_res { my $func = shift; my $do_abort = shift; my $cmd = shift; my $res = shift; my @res_list = expand_res_list($cmd, $res); if (@res_list) { ldie "Cannot combine command '$cmd' with 'all' existing resources - you must explicitly name a single new resource\n" if $cmd =~ m/create|join/; my $any_success = 0; my $any_fail = 0; my $any_member = 0; call_hook(!$force, "all-pre", $cmd, "all", @_) if $do_abort; foreach $res (@res_list) { $any_member++; $res =~ s/^.*\/resource-(.*)$/$1/; next if defined($skip_res{$res}); if ($verbose || $cmd !~ m/^log-/) { my $tpl = get_macro("default-resource"); my $hint = eval_macro($cmd, $res, $tpl, @_); my $type = "guest"; $type = "resource" if is_member($res, $host); lprint "--------- $type $hint\n"; } if (!$do_abort) { # LOOP RETRY mode # Retry when exit code is not 0 # catch internal ldie() via eval{} failure my $status = 1; my $has_died = 0; eval { $status = do_one_res($func, $cmd, $res, @_); 1; } or # eval{} has failed ( $has_died = 1 ); $any_fail++ if $status; # Only ldie() implies future skipping of this resource. if ($has_died && !$skip_res{$res}) { $any_fail++; $skip_res{$res} = 1; fail_action($cmd, $res); } next; } # NO LOOP RETRY # Any non-zero exit code means to skip this resource in future. # catch internal ldie() via eval{} failure my $has_died = 0; eval { do_one_res($func, $cmd, $res, @_); 1; } and # eval{} has succeeded $any_success = 1 or # eval{} has failed ( $has_died = 1 ); if ($has_died && !$skip_res{$res}) { $skip_res{$res} = 1; fail_action($cmd, $res); } } return $any_fail unless $do_abort; if (!$any_success) { if (!$any_member) { lprint "I am not member/guest of any resource\n"; return 1; } ldie "all resources have errors\n"; } call_hook(!$force, "all-post", $cmd, "all", @_); return !$any_success; } elsif ($res eq "all") { lwarn "resource qualifier 'all' does not match any resource or guest names\n"; return 0; } elsif (!$do_abort) { return do_one_res($func, $cmd, $res, @_); } else { my $has_died = 0; my $status = 0; eval { $status = do_one_res($func, $cmd, $res, @_); 1; } or # eval{} has failed ( $has_died = 1 ); if ($has_died && !$skip_res{$res}) { fail_action($cmd, $res); $status = 1; } return $status; } } if ($cmd =~ m/^(view|pretty)/) { lwarn "mars kernel module is not loaded\n" unless is_module_loaded(); if ($res eq "all" && $cmd =~ m/^view-?(.*)/) { my $global_macro_name = $1 ? "$1-header" : "default-header"; my $global_macro = get_macro($global_macro_name, 1); view_cmd($global_macro_name, "", @args) if $global_macro; } do_all_res(\&view_cmd, 1, $cmd, $res, @args); if ($res eq "all" && $cmd =~ m/^view-?(.*)/) { my $prefix = $1; foreach my $suffix ("global", "footer") { my $global_macro_name = $prefix ? "${prefix}-$suffix" : "default-$suffix"; my $global_macro = get_macro($global_macro_name, 1); view_cmd($global_macro_name, "", @args) if $global_macro; } } finish_links(); # report any dead peers my $report_foreign = ($ignore_deleted_peers > 1); get_alive_links("all", "alive", "*", $report_foreign, 0); exit($error_count); } my $func = $cmd_table{$cmd}; ldie "unknown command '$cmd'\n" unless $func; my %kid_res; sub wait_pid_list { my @pid_list = @_; foreach my $pid (@pid_list) { my $check_pid = waitpid($pid, 0); my $status = $?; my $sub_res = $kid_res{$pid}; if ($status > 0) { lwarn "RESOURCE $sub_res CHILD $pid terminated with status=$status\n"; $error_count++ if $status; } elsif ($check_pid == $pid) { lprint_stderr "RESOURCE $sub_res CHILD $pid terminated successfully\n"; } else { lwarn "RESOURCE $sub_res CHILD $pid terminated with unknown state\n"; } } } sub wait_any_pid { my ($flags) = @_; lprint_stderr "WAITING for termination of a child...\n"; my $pid = waitpid(-1, $flags); my $status = $?; if ($pid > 0) { my $sub_res = $kid_res{$pid}; if ($status != 0) { lwarn "RESOURCE $sub_res CHILD $pid terminated with status=$status\n"; $error_count++ if $status; } else { lprint_stderr "RESOURCE $sub_res CHILD $pid terminated successfully\n"; } } return $pid; } sub do_fork { my @res_list = expand_res_list($cmd, $res); my $child_count = 0; # only fork() when beneficial if (@res_list && scalar(@res_list) > 1) { lprint_stderr "FORKING (error_count=$error_count)\n"; $error_count = 0; foreach my $child_res (@res_list) { # when necessary, limit the parallelism degree if ($parallel > 0 && $child_count >= $parallel) { my $done_pid = wait_any_pid(0); if ($done_pid > 0) { $child_count--; delete $kid_res{$done_pid}; } } my $pid = fork(); ldie "Cannot fork()\n" unless defined($pid); if ($pid) { lprint_stderr "RESOURCE $child_res starting CHILD $pid\n"; $child_count++; $kid_res{$pid} = $child_res; } else { # child: simply continue with new $res $res = $child_res; $child_count = 0; $child_prefix = "CHILD $child_res: "; if ($skip_res{$res}) { ldie "SKIPPING\n"; } lprint_stderr "STARTING\n"; last; } } } if ($child_count) { my @wait_list = sort alphanum_cmp keys(%kid_res); wait_pid_list(@wait_list); lprint_stderr "EXIT $error_count\n"; exit($error_count); } } if (ref($func) eq "ARRAY") { my @list = @$func; while (@list) { my $do_loop = 0; my $memb_func = shift @list; # check whether fork() is possible if ($memb_func) { if ($memb_func eq "PARALLEL") { if ($parallel < -99) { lprint "Treating as --parallel=0\n" if $verbose; $parallel = 0; } next; } if ($memb_func eq "FORK") { $memb_func = shift @list; # check whether fork() is requested if ($parallel >= 0) { do_fork(); } } # check whether busy-waiting loop is requested if ($memb_func eq "LOOP") { $memb_func = shift @list; $do_loop++; } elsif ($memb_func eq "SLEEP") { $memb_func = shift @list; sleep(7); } elsif ($memb_func eq "CHANGE_RESOURCES") { $res = shift @list; next; #$memb_func = shift @list; } } # nested arrays may be used for _global_ workers if (ref($memb_func) eq "ARRAY") { my $global_msg = ""; my @sub_list = @$memb_func; while (my $sub_func = shift @sub_list) { my $ref_type = ref($sub_func); if ($ref_type eq "" && $sub_func) { $global_msg .= $sub_func; next; } next unless $ref_type eq "CODE"; while ($sub_list[0] && ref($sub_list[0]) eq "") { $global_msg .= shift @sub_list; } if ($global_msg) { $phase_nr++; lprint "-------- GLOBAL PHASE $phase_nr -------- $global_msg:\n"; $global_msg = ""; } &{$sub_func}($cmd, $res, @args); } } next unless ref($memb_func) eq "CODE"; my $headline = shift @list; $phase_nr++; lprint "-------- PHASE $phase_nr -------- $headline:\n" if defined($headline); $headline = "" unless defined($headline); if ($single_step) { ldie "DEBUGGING: standard input is no tty\n" unless -t STDIN; lprint "DEBUGGING: type RETURN for starting phase $phase_nr: "; my $dummy = ; lprint "DEBUGGING: continue phase $phase_nr: $headline\n"; } my $start_time = mars_time(); my $do_abort = !$do_loop; my $status; my $count = 0; for (;;) { $status = do_all_res($memb_func, $do_abort, $cmd, $res, @args); last if !$do_loop; last if (!defined($status) || !$status); # we have a busy wait condition $count++; lprint "--- status='$status' check again LOOP $count\n" if $verbose; sleep(1); my $now = mars_time(); if ($now - $start_time > $timeout) { lwarn "Condition '$headline' for resources '$res' not reached withing $timeout s\n"; last; } } finish_links(); if (defined($status) && $status) { lprint_stderr "STATUS='$status'\n" if $verbose; last; } } } elsif (ref($func) eq "CODE") { do_all_res($func, 1, $cmd, $res, @args); } else { ldie "internal error: command table is wrong for '$cmd'"; } finish_links(); # do not disturb stdout of informational commands lprint_stderr "EXIT $error_count\n" if $verbose; exit($error_count);