mars/userspace/marsadm
2020-12-22 08:58:02 +01:00

9724 lines
301 KiB
Perl
Executable File

#!/usr/bin/perl -w
#
# MARS Long Distance Replication Software
#
# This file is part of MARS project: http://schoebel.github.io/mars/
#
# Copyright (C) 2010-2014 Thomas Schoebel-Theuer
# Copyright (C) 2011-2014 1&1 Internet AG
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
use strict;
use English;
use warnings;
umask 0077;
##################################################################
# global defaults
my $parallel = -1;
my $single_step = 0;
my $inject_phase = 0;
my $compat_deletions = 1;
my $compat_alivelinks = 1;
my $threshold = 10 * 1024 * 1024;
my $window = 60;
my $keep_backups = 24 * 7;
my $verbose = 0;
my $max_deletions = 512;
my $thresh_logfiles = 10;
my $thresh_logsize = 5; # GB
my $dry_run = 0;
my @MARS_PATH = $ENV{MARS_PATH} ?
split(/:/, $ENV{MARS_PATH}) :
(
".",
defined($ENV{HOME}) ? "$ENV{HOME}/.marsadm" : "",
"/etc/marsadm",
"/usr/lib/marsadm",
"/usr/local/lib/marsadm",
);
my $marsadm_var_dir = defined($ENV{MARSADM_VRA_DIR}) ?
$ENV{MARSADM_VAR_DIR} :
"/var/marsadm";
##################################################################
# messaging
my %skip_res;
my $error_count = 0;
my $notify = "";
my $child_prefix = "";
my $logger = "/usr/bin/logger";
sub llog {
my ($text) = @_;
if ($notify) {
$text =~ s/"/\\"/g;
system("$logger -t marsadm \"$notify $text\"");
}
}
sub lprint {
my ($text) = @_;
$OUTPUT_AUTOFLUSH = 1;
print $child_prefix . $text;
llog($text);
}
sub lprint_stderr {
my ($text) = @_;
$OUTPUT_AUTOFLUSH = 1;
print STDERR $child_prefix . $text;
llog($text);
}
sub lskip {
my ($res, $text) = @_;
# Already set %skip_res here, so it doesn't count as error
$skip_res{$res} = 1;
lprint_stderr "SKIPPING: $text";
# trigger an exception
die "\n";
}
sub ldie {
my ($text) = @_;
if ($verbose > 2) {
my $i = 1;
for (;;) {
my ($package, $filename, $line, $subroutine, $hasargs, $wantarray, $evaltext, $is_require, $hints, $bitmask, $hinthash) = caller($i++);
last unless defined($subroutine);
lprint_stderr "$line:$subroutine\n";
}
}
$error_count++;
lprint_stderr "DYING: $text";
llog("DYING: $text");
die "\n";
}
sub lwarn {
my ($text) = @_;
lprint_stderr "WARNING: $text";
llog("WARNING: $text");
}
##################################################################
# basic helpers
sub lamport_glob {
my $expr = shift;
my @result =();
my @raw_glob = glob($expr);
foreach my $path (@raw_glob) {
my $val = readlink($path);
next if (defined($val) && $val eq ".deleted");
push @result, $path;
}
return @result;
}
sub safe_creat {
my $path = shift;
if (-l $path) {
my $val = readlink($path);
unlink($path) if $val eq ".deleted";
}
my $fh = undef;
use IO::Handle;
use Fcntl;
my $status = sysopen($fh, $path, O_CREAT | O_EXCL | O_NOFOLLOW | O_RDONLY);
close($fh);
return (defined($status) && $status);
}
sub safe_touch {
my ($path, $stamp) = shift;
use IO::Handle;
if (-l $path) {
my $val = readlink($path);
return 0 if $val eq ".deleted";
}
return utime($stamp, $stamp, $path);
}
sub link_exists {
my $path = shift;
return 0 unless -l $path;
my $val = readlink($path);
return 0 unless defined($val);
return 0 if $val eq ".deleted";
return 1;
}
sub file_exists {
my $path = shift;
if (-l $path) {
my $val = readlink($path);
return 0 if $val eq ".deleted";
}
return 1 if -f $path;
return 0;
}
sub any_exists {
my $path = shift;
return 1 if link_exists($path);
return 1 if file_exists($path);
return 0;
}
##################################################################
# global variables
my $Id = '$Id$ ';
my $user_version = 0.1;
my $marsadm_version = 2.9; # some rough hint at newer features
my $mars = "/mars";
my $host = `uname -n` or ldie "cannot determine my network node name\n";
chomp $host;
check_id($host);
my $real_host = $host;
my $backup_dir = "$mars/backups-" . time();
my $force = 0;
my $ignore_sync = 0;
my $cron_mode = 0;
my $timeout = 600;
my $phase_nr = 0;
my $ssh_port = 22;
my $ssh_opts = "-A -o StrictHostKeyChecking=no -o ConnectTimeout=5";
my $ssh_probe = "uname -a";
my $rsync_opts = "-av --exclude='.deleted-*'";
my $kernel_version = 0;
my $kernel_features_version = -1;
my $kernel_strategy_version = -1;
my $kernel_flags_version = ~0x0;
my %known_ips;
##################################################################
# general helpers
my $allow_fail_action = undef;
# Failure compensation.
# This is called when do_one_res() fails
sub fail_action {
my ($cmd, $res) = @_;
lprint "FAILURE cmd='$cmd' res='$res'\n";
$error_count++;
return unless $allow_fail_action;
lprint "===== BEGIN FAILURE COMPENSATION cmd='$cmd' res='$res'\n";
# prevent recursive ldie
eval {
&$allow_fail_action(@_);
my $sub_status = $?;
lprint "COMPENSATION sub_status='$sub_status'\n";
1;
} or
(
lwarn "DOUBLE FAILURE detected.\n"
);
lprint "===== END FAILURE COMPENSATION cmd='$cmd' res='$res'\n";
}
sub _device_name {
my ($res, $peer) = @_;
$peer = $host unless defined($peer);
my $name = get_link("$mars/resource-$res/device-$peer", 1);
$name = $res if (!$name || $name eq "(none)");
return $name;
}
sub device_name {
my $name = _device_name(@_);
$name = "/dev/mars/$name" if (defined($name) && $name);
return $name;
}
sub device_exists {
my ($res, $peer) = @_;
$peer = $host unless defined($peer);
return 0 if $peer eq "(none)";
if ($peer eq $real_host) {
# Silent fallback to local detection for old kernel module versions
my $buildtag = get_alive_link("buildtag", $peer, 1);
if (!$buildtag) {
# VERY old MARS modules dont report their version
$buildtag = `cut -d' ' -f1 < /proc/sys/mars/version`;
# Sometimes "never touch a running system" is a BAD strategy...
lwarn "Please upgrade your EXTREMELY OLD module version '$buildtag'\n" if $buildtag;
}
if (defined($buildtag) && $buildtag =~ m/([0-9]+)($|\()/ && $1 <= 97) {
my $name = device_name($res, $peer);
my $val = (-b $name) ? 1 : 0;
return $val;
}
}
my $lnk = "$mars/resource-$res/actual-$peer/if-on";
my $val = get_link($lnk, 2);
# backwards compatibility to old kernels
my $lnk_old = "$mars/resource-$res/actual-$peer/device-$peer";
my $val_old = get_link($lnk_old, 2);
if (defined($val_old) && $val_old ne "" &&
(!defined($val) || $val eq "" ||
get_link_stamp($lnk_old) > get_link_stamp($lnk))) {
$val = $val_old;
}
if (!defined($val) || $val eq "") {
# Fallback to local device
my $name = device_name($res, $peer);
if ($peer eq $real_host) {
lwarn "Unexpected fallback to local device detection $name for $peer\n";
$val = (-b $name) ? 1 : 0;
} else {
lwarn "Cannot determine device presence for $peer\n";
}
}
return $val;
}
##################################################################
# ssh helpers
# deprecated, to disappear in a future release
my %ssh_ips;
my %ssh_ports;
sub make_ssh_cmd {
my ($peer, $no_login) = @_;
return "" unless $ssh_port;
my $ssh = "ssh $ssh_opts";
if (!defined($ssh_ports{$peer})) {
my $port;
my $real_peer;
my $peer_ip = get_link("$mars/ips/ip-$peer", 2);
# workaround firewall hell at some installations...
RETRY:
for my $this_peer ($peer, $peer_ip) {
next unless defined($this_peer);
next unless $this_peer;
# check whether machine is reachable
if (system("ping -c1 $this_peer")) {
lwarn "cannot ping '$this_peer'\n";
next;
}
# first try given parameters, then port 22, then ssh_config defaults
my %seen = ();
foreach my $try_port ($ssh_port, 22, 0) {
next if defined($seen{$try_port});
$seen{$try_port} = 1;
if ($try_port && system("if which nc; then nc -v -4 -w 5 \"$this_peer\" $try_port < /dev/null > /dev/null; fi")) {
lwarn "SSH port '$try_port' for '$this_peer' is not reachable\n";
next;
}
my $opt_port = $try_port ? "-p $try_port" : "";
if (!system("$ssh $opt_port root\@$this_peer \"$ssh_probe\"")) {
($real_peer, $port) = ($this_peer, $try_port);
last RETRY;
}
lwarn "SSH to '$this_peer' does not work on port '$try_port'\n";
}
}
ldie "SSH to '$peer' does not work\n" unless defined($port);
$ssh_ips{$peer} = $real_peer;
$ssh_ports{$peer} = $port;
}
my $real_peer = $ssh_ips{$peer};
$real_peer = $peer if !defined($real_peer);
my $port = $ssh_ports{$peer};
$ssh .= " -p $port" if $port > 0;
return ($real_peer, $port, $ssh) if defined($no_login);
return "$ssh root\@$real_peer"
}
sub ssh_cmd {
my ($peer, $cmd, $no_fail) = @_;
return 1 unless $ssh_port;
if ($peer ne $real_host) {
$cmd =~ s/'/\'/g;
$cmd = make_ssh_cmd($peer) . " '$cmd'";
} else {
$cmd =~ s/'/\'/g;
$cmd = "bash -c '$cmd'";
}
ldie "empty command\n" unless $cmd;
my $status = system($cmd);
if ($status and !defined($no_fail)) {
ldie "SSH to '$peer' command '$cmd' failed\n";
}
return $status;
}
sub rsync_cmd {
my ($peer, $cmd, $no_fail) = @_;
return 1 unless $ssh_port;
my ($real_peer, $port, $ssh) = make_ssh_cmd($peer, 1);
$cmd =~ s/(?<![-a-zA-Z0-9_])$peer(?![-a-zA-Z0-9_])/$real_peer/g if (defined($real_peer) && $real_peer && $real_peer ne $peer);
my $rsync_cmd = "rsync -e '$ssh' $rsync_opts $cmd";
if ($dry_run) {
lprint "DRY_RUN: would execute the following command:\n$rsync_cmd\n";
$rsync_cmd .= " --dry-run";
}
finish_links();
# Repeat the action because some symlinks might be updated (or even vanish)
# when the peer is actively running. There seems to exist no rsync option
# for ignoring all of these errors.
my $cycle = 0;
my $status = 0;
do {
$status = system($rsync_cmd);
} while ($status != 0 && $status != 24 && $cycle++ < 3);
ldie "Cannot get remote symlink tree via rsync, status=$status\n" if (!defined($no_fail) && $status && $status != 24);
return $status;
}
##################################################################
# external hook scripts
sub _try_exec {
my ($path, $cmd, $args, $abort) = @_;
return unless -x $path;
lprint "RUNNING HOOK $path $cmd $args\n" if $verbose;
unless ($dry_run) {
my $status = system("$path $cmd $args");
lprint "HOOK status=$status\n" if $verbose;
if ($status) {
lwarn "HOOK $path $cmd $args\nFAILURE $status\n";
ldie if $abort;
}
}
}
sub _try_exec_dir {
my ($subdir, $cmd, $args, $abort) = @_;
return unless -d $subdir;
foreach my $sub_path (lamport_glob("$subdir/*")) {
next if $sub_path =~ m:~$:;
_try_exec($sub_path, $cmd, $args, $abort);
}
}
sub call_hook {
my $abort = shift;
my $type = shift;
my $cmd = shift;
my $args = join(" ", @_);
my $pattern = "hook-$type";
foreach my $dir (@MARS_PATH) {
-d $dir || next;
_try_exec("$dir/$pattern-$cmd", $cmd, $args, $abort);
_try_exec("$dir/$pattern", $cmd, $args, $abort);
_try_exec_dir("$dir/$pattern-$cmd.d", $cmd, $args, $abort);
_try_exec_dir("$dir/$pattern.d", $cmd, $args, $abort);
}
}
##################################################################
# syntactic scanning / parsing
my $match_comment = qr'#[^\n]*|//\h[^\n]*|/\*(?:[^*]|\*[^/])*\*/|\\\n\s*';
my $match_nobrace = qr'(?:[^{}\\]|\\.)*'s;
my $match_inner = $match_nobrace;
my $match_brace = qr"\{$match_inner\}"s;
for (my $i = 0; $i < 20; $i++) {
$match_inner = qr"$match_nobrace(?:$match_brace$match_nobrace)*"s;
$match_brace = qr"\{$match_inner\}"s;
}
my $match_fn_head = qr"\%([^\\\s{}()\[\]]*)(?=\{)"s;
my $match_fn = qr"$match_fn_head(?:\{($match_inner)\})"s;
# keep this in sync with mars.h definition of MREF_FLAGS
my %STATUS_FLAGS =
(
"UPTODATE" => 0x00000001,
"READING" => 0x00000002,
"WRITING" => 0x00000004,
"WRITE" => 0x00000008,
"MAY_WRITE" => 0x00000010,
"SKIP_SYNC" => 0x00000020,
"NODATA" => 0x00000040,
);
my %FEATURES_FLAGS =
(
"CHKSUM_MD5_OLD" => 0x00010000,
"CHKSUM_MD5" => 0x00020000,
"CHKSUM_CRC32C" => 0x00040000,
"CHKSUM_CRC32" => 0x00080000,
"CHKSUM_SHA1" => 0x00100000,
"COMPRESS_LZO" => 0x01000000,
"COMPRESS_LZ4" => 0x02000000,
"COMPRESS_ZLIB" => 0x04000000,
);
my %FLAGS_FEATURES;
my $chksum_features = 0x0;
my $compress_features = 0x0;
sub featuresflags2txt {
my ($flags, $skip_unknown) = @_;
$flags = 0x0 unless (defined($flags) && $flags ne "");
$flags = hex($flags) if $flags =~ m/^0x/;
my $txt = "";
for (my $mask = 0x1; $mask < (1 << 32); $mask <<= 1) {
my $bit = $flags & $mask;
next unless $bit;
if (defined($FLAGS_FEATURES{$bit})) {
$txt .= "|" if $txt;
$txt .= $FLAGS_FEATURES{$bit};
} elsif (!defined($skip_unknown)) {
$txt .= "|" if $txt;
$txt .= sprintf("0x%x", $bit);
}
}
return $txt;
}
sub txt2featuresflags {
my ($cmd, $txt) = @_;
my $flags = 0x0;
foreach my $this_txt (split(/\s*[|]\s*/, $txt)) {
chomp $this_txt;
$this_txt =~ s/^\s+//;
ldie "Digest names must start with 'CHKSUM_'" if ($cmd =~ m/digest/ && $this_txt !~ m/^CHKSUM/);
if (defined($FEATURES_FLAGS{$this_txt})) {
$flags |= $FEATURES_FLAGS{$this_txt};
} elsif ($this_txt =~ m/^0x[0-9a-f]+/) {
$flags |= hex($this_txt);
} elsif ($this_txt =~ m/^0[0-9]+/) {
$flags |= oct($this_txt);
} elsif ($this_txt =~ m/^[0-9]+$/) {
$flags |= $this_txt;
} else {
ldie "Unknown symbolic feature flag '$this_txt'\n";
}
}
return sprintf("0x%08x", $flags);
}
##################################################################
# Resource lists and their peers
# 2-dimensional caches for cartesian product
my %total_resources;
my %member_resources;
my %guest_resources;
my %any_resources;
my %total_peers;
my %member_peers;
my %guest_peers;
my %any_peers;
my $cache_dir = "$mars/cache-$real_host";
sub __read_cache {
my ($filename, $hash) = @_;
open(my $fh, "<", $filename) or return 0;
while (my $line = <$fh>) {
chomp $line;
my ($key1, $key2, $val) = split(" ", $line, 3);
$$hash{$key1}{$key2} = $val;
}
close($fh);
lprint "READ CACHE $filename\n" if $verbose >= 2;
return 1;
}
sub _read_cache {
return 0 if $kernel_strategy_version < 4;
my $inval_path = "$cache_dir/invalid";
if (-l $inval_path) {
_reset_resources();
return 0;
}
my $ok =
__read_cache("$cache_dir/total_resources.cache", \%total_resources) &&
__read_cache("$cache_dir/member_resources.cache", \%member_resources) &&
__read_cache("$cache_dir/guest_resources.cache", \%guest_resources) &&
__read_cache("$cache_dir/any_resources.cache", \%any_resources) &&
__read_cache("$cache_dir/total_peers.cache", \%total_peers) &&
__read_cache("$cache_dir/member_peers.cache", \%member_peers) &&
__read_cache("$cache_dir/guest_peers.cache", \%guest_peers) &&
__read_cache("$cache_dir/any_peers.cache", \%any_peers);
if (!$ok) {
_reset_resources();
}
return $ok;
}
sub __write_cache {
my ($filename, $hash) = @_;
my $tmpname = "$filename.tmp.$$";
local $OFS = " ";
local $ORS = "\n";
open(my $fh, ">", $tmpname) or return 0;
foreach my $key1 (keys(%$hash)) {
my $sub_hash = $$hash{$key1};
foreach my $key2 (keys(%$sub_hash)) {
my $val = $$sub_hash{$key2};
if (!print $fh ($key1, $key2, $val)) {
unlink($tmpname);
return 0;
}
}
}
if (!close($fh)) {
unlink($tmpname);
return 0;
}
rename($tmpname, $filename);
return 1;
}
sub _write_cache {
mkdir($cache_dir);
my $ok =
__write_cache("$cache_dir/total_resources.cache", \%total_resources) &&
__write_cache("$cache_dir/member_resources.cache", \%member_resources) &&
__write_cache("$cache_dir/guest_resources.cache", \%guest_resources) &&
__write_cache("$cache_dir/any_resources.cache", \%any_resources) &&
__write_cache("$cache_dir/total_peers.cache", \%total_peers) &&
__write_cache("$cache_dir/member_peers.cache", \%member_peers) &&
__write_cache("$cache_dir/guest_peers.cache", \%guest_peers) &&
__write_cache("$cache_dir/any_peers.cache", \%any_peers);
if (!$ok) {
_reset_resources();
}
return $ok;
}
sub _scan_caches {
return if %total_peers;
return if _read_cache();
# Reset all 2-dimensional hashes
%total_resources = ();
%member_resources = ();
%guest_resources = ();
%any_resources = ();
%total_peers = ();
%member_peers = ();
%guest_peers = ();
%any_peers = ();
my $ips_glob = "$mars/ips/ip-*";
my $resource_glob = "$mars/resource-*";
my $active_glob = "$mars/resource-*/{device,data,replay}-*";
my $probe_dir = "$mars/probe-$real_host";
if (-d $probe_dir) {
$ips_glob .= " $probe_dir$ips_glob";
$resource_glob .= " $probe_dir$resource_glob";
$active_glob .= " $probe_dir$active_glob";
}
# Add all known hosts to %total_peers but _not_ to %any_peers.
# Reason: some hosts might not be member/guest of any resource
foreach my $path (lamport_glob($ips_glob)) {
$path =~ m:/ip-(.*):;
my $this_peer = $1;
$total_peers{$this_peer} = {};
}
# Add all known resources to %total_resources but _not_ to %any_resources.
# Reason: some resources might exist but have no members / guests.
foreach my $path (lamport_glob($resource_glob)) {
$path =~ m:/resource-(.*):;
my $this_res = $1;
$total_resources{$this_res} = {};
}
# Now we look at all relevant combinations between resources and hosts
my @total_paths = lamport_glob($active_glob);
my %paths;
foreach my $path (@total_paths) {
# %paths hash: make $probe_dir variants globally unique
$path =~ s/^$probe_dir//;
$paths{$path} = 1;
}
foreach my $path (@total_paths) {
next unless $path =~ m:/resource-([^/]+?)/[a-z]+-(.+):;
my $this_res = $1;
my $this_peer = $2;
# dynamic programming
next if $total_resources{$this_res}{$this_peer};
# remember result combinations
$total_resources{$this_res}{$this_peer} = 1;
$total_peers{$this_peer}{$this_res} = 1;
my $is_any = $paths{"$mars/resource-$this_res/device-$this_peer"};
if ($is_any) {
$any_resources{$this_res}{$this_peer} = 1;
$any_peers{$this_peer}{$this_res} = 1;
}
my $is_member =
$paths{"$mars/resource-$this_res/data-$this_peer"} ||
$paths{"$mars/resource-$this_res/replay-$this_peer"};
if ($is_member) {
$member_resources{$this_res}{$this_peer} = 1;
$member_peers{$this_peer}{$this_res} = 1;
next;
}
my $is_guest =
$is_any &&
get_link("$mars/resource-$this_res/actual-$this_peer/prosumer-on", 2) ||
get_link("$mars/resource-$this_res/todo-$this_peer/exports", 2) =~ m:(^|\+)$this_peer($|\+):;
if ($is_guest) {
$guest_resources{$this_res}{$this_peer} = 1;
$guest_peers{$this_peer}{$this_res} = 1;
next;
}
# Notice: _candidates_ for guests are over here.
# They can be determined by set_minus(%any_peers,%member_peers)
}
if ($verbose) {
lprint "====== found " .
scalar(keys(%total_peers)) . " total and " .
scalar(keys(%member_peers)) . " participating and " .
scalar(keys(%guest_peers)) . " guest " .
"peers\n";
lprint "====== found " .
scalar(keys(%total_resources)) . " total and " .
scalar(keys(%member_resources)) . " participating and " .
scalar(keys(%guest_resources)) . " guest " .
"resources\n";
}
if (!_write_cache()) {
lwarn "cannot write peer cache\n";
}
}
sub _reset_resources {
system("rm -rf $cache_dir/*.cache");
%total_peers = ();
}
sub is_member {
my ($res, $peer) = @_;
_scan_caches() unless %total_peers;
return $member_resources{$res}{$peer};
}
sub is_guest {
my ($res, $peer) = @_;
_scan_caches() unless %total_peers;
return $guest_resources{$res}{$peer};
}
sub is_any {
my ($res, $peer) = @_;
_scan_caches() unless %total_peers;
return $any_resources{$res}{$peer};
}
sub alphanum_cmp {
my ($aa, $bb) = ($a, $b);
$aa =~ s/([0-9]+)/sprintf("%012d",$1)/eg;
$bb =~ s/([0-9]+)/sprintf("%012d",$1)/eg;
return $aa cmp $bb;
}
sub reverse_cmp {
my ($aa, $bb) = ($b, $a);
$aa =~ s/([0-9]+)/sprintf("%012d",$1)/eg;
$bb =~ s/([0-9]+)/sprintf("%012d",$1)/eg;
return $aa cmp $bb;
}
sub get_total_resources {
my $peer = shift;
_scan_caches() unless %total_peers;
if ($peer) {
my $projection = $total_peers{$peer};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%total_resources);
}
}
sub get_member_resources {
my $peer = shift;
_scan_caches() unless %total_peers;
if ($peer) {
my $projection = $member_peers{$peer};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%member_resources);
}
}
sub get_guest_resources {
my $peer = shift;
_scan_caches() unless %total_peers;
if ($peer) {
my $projection = $guest_peers{$peer};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%guest_resources);
}
}
sub get_any_resources {
my $peer = shift;
_scan_caches() unless %total_peers;
if ($peer) {
my $projection = $any_peers{$peer};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%any_resources);
}
}
sub get_total_peers {
my $res = shift;
_scan_caches() unless %total_peers;
if ($res) {
my $projection = $total_resources{$res};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%total_peers);
}
}
sub get_member_peers {
my $res = shift;
_scan_caches() unless %total_peers;
if ($res) {
my $projection = $member_resources{$res};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%member_peers);
}
}
sub get_guest_peers {
my $res = shift;
_scan_caches() unless %total_peers;
if ($res) {
my $projection = $guest_resources{$res};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%guest_peers);
}
}
sub get_any_peers {
my $res = shift;
_scan_caches() unless %total_peers;
if ($res) {
my $projection = $any_resources{$res};
return sort alphanum_cmp keys(%$projection);
} else {
return sort alphanum_cmp keys(%any_peers);
}
}
sub key_intersect {
my ($hash1, $hash2) = @_;
my %h1 = %$hash1;
my %h2 = %$hash2;
my %result = %h1;
foreach my $key (keys(%h2)) {
delete $result{$key};
}
return %result;
}
##################################################################
# dynamic systemd control
my $systemd_subdir = defined($ENV{SYSTEMD_SUBDIR}) ? $ENV{SYSTEMD_SUBDIR} : "systemd-templates";
my $systemd_system_dirs = defined($ENV{SYSTEMD_SYSTEM_DIRS}) ?
$ENV{SYSTEMD_SYSTEM_DIRS} :
"/etc/systemd/system,/run/systemd/system,/usr/lib/systemd/system";
my $systemd_target_dir = defined($ENV{SYSTEMD_TARGET_DIR}) ? $ENV{SYSTEMD_TARGET_DIR} : "/run/systemd/system";
my $systemd_var_dir = defined($ENV{SYSTEMD_VAR_DIR}) ?
$ENV{SYSTEMD_VAR_DIR} :
"$marsadm_var_dir/systemd";
my $systemd_suffixes = defined($ENV{SYSTEMD_SUFFIXES}) ?
$ENV{SYSTEMD_SUFFIXES} :
"service,socket,device,mount,automount,swap,target,path,timer,slice,scope";
my $systemctl = defined($ENV{SYSTEMCTL}) ? $ENV{SYSTEMCTL} : "systemctl";
my $systemd_escape = defined($ENV{SYSTEMD_ESCAPE}) ? $ENV{SYSTEMD_ESCAPE} : "@";
my $systemd_incape = defined($ENV{SYSTEMD_INCAPE}) ? $ENV{SYSTEMD_INCAPE} : "\\^";
my $systemd_dependencies = defined($ENV{SYSTEMD_DEPENDENCIES}) ?
$ENV{SYSTEMD_DEPENDENCIES} :
"Requires|Requisite|Wants|BindsTo|PartOf|Conflicts|Before|After|OnFailure|PropagatesReloadTo|ReloadPropagatedFrom|JoinsNamespaceOf|RequiresMountsFor|Alias|WantedBy|RequiredBy|Also|DefaultInstance";
my $systemd_lock_file = defined($ENV{SYSTEMD_LOCK_FILE}) ? $ENV{SYSTEMD_LOCK_FILE} : "/tmp/systemd.lock";
my @systemctl_start =
(
"mars-trigger.path", # This MUST come first
"mars-emergency.path",
);
my @systemctl_enable =
(
@systemctl_start,
"mars-trigger.service",
"mars-emergency.service",
);
my %recursive_locks;
sub systemd_lock {
my ($suffix, $try_lock) = @_;
my $lock_file = $systemd_lock_file;
$lock_file .= "." . $suffix if defined($suffix) && $suffix;
my $lock_status = $recursive_locks{$lock_file}++;
if ($lock_status) {
return 0;
}
lprint "TRYING '$lock_file'\n" if $verbose > 1;
use IO::Handle;
use Fcntl;
my $max_time = $timeout > 0 ? $timeout : 30;
my $count = 0;
my $retry = 0;
my $fh;
for (;;) {
my $test_pid;
if (open(my $IN, "<", $lock_file)) {
$test_pid = <$IN>;
chomp $test_pid;
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($IN);
close($IN);
# Check for timeout
if ($count > $max_time ||
(defined($mtime) && $mtime && $mtime + $max_time < time())) {
lwarn "breaking lock $lock_file after $max_time seconds\n";
unlink($lock_file);
$count = 0;
}
}
$fh = undef;
my $status = sysopen($fh, $lock_file, O_CREAT|O_EXCL|O_TRUNC|O_WRONLY);
last if defined($status) && $status;
# Check whether pid exists
if (defined($test_pid) && $test_pid && ! -d "/proc/$test_pid") {
next if !$retry++;
lwarn "breaking lock $lock_file, pid $test_pid is no longer alive.\n";
unlink($lock_file);
$count = 0;
$retry = 0;
next;
}
if (defined($try_lock) && $try_lock && !$force) {
lprint "FAILED '$lock_file'\n" if $verbose > 1;
return 1;
}
$count++;
sleep(1);
}
print $fh "$$\n";
close($fh);
lprint "LOCK '$lock_file'\n" if $verbose > 1;
return 0;
}
sub systemd_unlock {
my ($suffix) = @_;
my $lock_file = $systemd_lock_file;
$lock_file .= "." . $suffix if defined($suffix) && $suffix;
if (--$recursive_locks{$lock_file} > 0) {
return;
}
unlink($lock_file);
lprint "UNLOCK '$lock_file'\n" if $verbose > 1;
}
my %template_names;
my %template_files;
sub get_template_files {
if (!%template_files) {
foreach my $dir (@MARS_PATH) {
my $subdir = "$dir/$systemd_subdir";
$subdir = $dir unless -d $subdir;
next unless -d $subdir;
foreach my $template_file (lamport_glob("$subdir/*.{$systemd_suffixes}")) {
my $template_name = `basename '$template_file'`;
chomp $template_name;
next unless $template_name;
# Only the first hit will win when the same template is in multiple dirs.
next if defined($template_files{$template_name});
$template_names{$template_file} = $template_name;
$template_files{$template_name} = $template_file;
lprint "== found template '$template_name' at '$template_file'\n" if $verbose;
}
}
}
return sort alphanum_cmp keys(%template_names);
}
sub get_instance_files {
my ($dir) = @_;
my $glob = "$dir/*.{$systemd_suffixes}";
my %instance_files;
foreach my $instance_file (lamport_glob($glob)) {
my $instance_name = `basename '$instance_file'`;
chomp $instance_name;
$instance_files{$instance_name} = $instance_file;
}
return %instance_files;
}
my %systemd_names;
my %systemd_files;
sub get_systemd_files {
if (!%systemd_names) {
foreach my $systemd_file (lamport_glob("{$systemd_system_dirs}/*.{$systemd_suffixes}")) {
next if $systemd_file =~ m:$systemd_target_dir:;
my $systemd_name = `basename '$systemd_file'`;
chomp $systemd_name;
$systemd_names{$systemd_file} = $systemd_name;
$systemd_files{$systemd_name} = $systemd_file;
}
}
return sort alphanum_cmp keys(%systemd_names);
}
sub systemctl {
my ($args) = @_;
my $cmd = "$systemctl $args";
systemd_lock();
lprint "executing: '$cmd'\n" if $verbose > 1;
my $status;
eval {
$status = system($cmd);
};
systemd_unlock();
return $status;
}
my %failed;
sub _systemd_escape {
my ($txt) = @_;
my $replac = `systemd-escape --path "$txt"`;
chomp $replac;
return $replac;
}
sub subst_systemd_vars {
my ($env, $text, $do_extra_escape) = @_;
my $parsed = "";
while ($text =~ m/[$systemd_escape]([A-Za-z_][-A-Za-z0-9_]*)?[{]($match_inner)[}]/ps) {
my $name = $1 || "";
my $body = $2;
$parsed .= $PREMATCH;
my $rest = $POSTMATCH;
my $this_escape = 0;
my $replac;
$_ = $name;
PRE_SWITCH: {
if (/^escvar$/) {
$name = "";
$this_escape = 1;
last PRE_SWITCH;
}
if (/^esc$/) {
$name = "verbatim";
$this_escape = 1;
last PRE_SWITCH;
}
}
$_ = $name;
SWITCH: {
if (/^eval$/) {
$replac = parse_macro($body, $env);
last SWITCH;
}
if (/^$/) {
my $varname = parse_macro($body, $env);
$replac = $$env{$varname};
if (!defined($replac)) {
lwarn "variable '$varname' is undefined\n" unless defined($failed{$varname});
$failed{$varname} = 1;
$replac = "UNDEFINED($varname)";
}
lprint " subst '$systemd_escape\{$varname\}' => '$replac'\n" if $verbose > 2;
last SWITCH;
}
if (/^verbatim$/) {
$replac = $body;
last SWITCH;
}
lwarn "systemd function '$name' is undefined\n";
$replac = $body;
}
if ($do_extra_escape || $this_escape) {
my $orig = $replac;
$replac = _systemd_escape($replac);
lprint " escape '$orig' => '$replac'\n" if $verbose > 2;
}
$parsed .= $replac;
$text = $rest;
}
return ($env, $parsed . $text);
}
sub match_systemd_vars {
my ($env, $pattern, $text) = @_;
($env, $pattern) = subst_systemd_vars($env, $pattern, 1);
($env, $text) = subst_systemd_vars($env, $text, 1);
my @names;
my $regex = "";
while ($pattern =~ m/[$systemd_incape][{]([A-Za-z_][A-Za-z0-9_]*)[}]/ps) {
my $name = $1;
my $pre = $PREMATCH;
my $post = $POSTMATCH;
push @names, $name;
$regex .= $pre . "(.+)";
$pattern = $post;
}
$regex .= $pattern;
$regex =~ s/\\/\\\\/g;
my $nr = 1;
if ($text =~ m/^$regex$/p) {
foreach my $name (@names) {
my $val;
eval "\$val = \$$nr;";
$$env{$name} = $val;
$nr++;
}
return ($env, $text);
}
return (undef, $text);
}
my %referenced_units;
sub _instantiate_systemd_unit {
my ($env, $template_file, $subst) = @_;
($env, my $replac) = subst_systemd_vars($env, $subst, 1);
my $outfile = "$systemd_var_dir.new/$replac";
chomp $outfile;
lprint "==== Translate systemd template '$template_file' => '$outfile'\n" if $verbose;
my $text = "";
{
local $/; # slurp
if (!open(IN, "< $template_file")) {
lwarn "cannot open system template file '$template_file'\n";
return (0, $outfile);
}
$text = <IN>;
close(IN);
}
($env, $text) = subst_systemd_vars($env, $text, 0);
my $scan = $text;
while ($scan =~ m/^\s*($systemd_dependencies)\s*=\s*(.*?)$/mp) {
my $next_unit_list = $2;
$scan = $POSTMATCH;
while ($next_unit_list =~ m/[^\s]+/) {
my $next_unit = $MATCH;
$next_unit_list = $POSTMATCH;
# some units like mount units may be specified as paths.
$next_unit = _systemd_escape($next_unit) if $next_unit =~ m:/:;
lprint "-- '$template_file' found reference to '$next_unit'\n" if $verbose > 2;
# Remember the encountered name
$referenced_units{$next_unit} = 1;
}
}
if (open(IN, "< $outfile")) {
# Check whether something has changed
local $/; # slurp
my $old = <IN>;
close(IN);
if (defined($old) && $old eq $text) {
lprint "== systemd unit '$outfile' has not changed\n" if $verbose;
return (0, $outfile);
}
}
if (!open(OUT, "> $outfile.tmp")) {
lwarn "cannot create '$outfile'\n";
return (0, $outfile);
}
print OUT $text;
close(OUT);
rename("$outfile.tmp", $outfile);
return (1, $outfile, $replac);
}
sub make_systemd_unit {
my ($cmd, $res, $target) = @_;
my @res_list;
if (defined($res)) {
@res_list = ($res);
} else {
@res_list = get_member_resources($host);
}
my ($found_env, $found_template_file, $found_subst);
search:
foreach my $template_file (get_template_files()) {
my $template_name = $template_names{$template_file};
next unless $template_name;
foreach my $res (@res_list) {
($template_name, my $env) = make_env($cmd, $res, $template_name);
my $subst = $template_name;
(my $new_env, $subst) = match_systemd_vars($env, $template_name, $target);
if ($new_env) {
($found_env, $found_template_file, $found_subst) = ($new_env, $template_file, $subst);
last search;
} elsif ($subst) {
# Check if already installed somewhere else
get_systemd_files();
if (defined($systemd_files{$subst})) {
lprint "systemd unit '$subst' is already present at '$systemd_files{$subst}'\n" if $verbose;
return 0;
}
}
}
}
if (!$found_template_file) {
lwarn "cannot find any systemd template for target unit '$target'\n";
return 0;
}
my ($nr, $file, $name) = _instantiate_systemd_unit($found_env, $found_template_file, $found_subst);
if ($nr) {
$systemd_names{$file} = $name;
$systemd_files{$name} = $file;
}
return $nr;
}
sub set_systemd_want {
my ($cmd, $res, $new) = @_;
if ($new ne "(none)") {
my $host_path = "$mars/ips/ip-$new";
unless (get_link($host_path, 1)) {
lwarn "The hostname '$new' does not exist.\n";
ldie "Refusing to set systemd-want.\n" unless $force;
}
}
my $unit_glob = "$mars/resource-$res/systemd-*-unit";
unless (lamport_glob($unit_glob)) {
lwarn "Resource '$res' has no configured systemd units.\n";
lwarn "First configure the resource with marsadm set-systemd-unit.\n";
ldie "Refusing to set systemd-want.\n" unless $force;
}
my $want_path = "$mars/resource-$res/systemd-want";
set_link($new, $want_path);
finish_links();
_systemd_trigger();
}
sub systemd_present {
my ($cmd, $res) = @_;
my $unit_glob = "$mars/resource-$res/systemd-*-unit";
return "" unless lamport_glob($unit_glob);
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
return $want;
}
sub get_systemd_want {
my $txt = systemd_present(@_);
lprint "$txt\n";
}
sub systemd_exists {
my ($unit_list) = @_;
foreach my $unit (split(/ +/, $unit_list)) {
my $check_cmd = "list-unit-files \"$unit\" | wc -l";
my $count = `$systemctl $check_cmd`;
if (!defined($count) || !$count || $count <= 0) {
lprint "nothing to do for systemd, unit file '$unit' does not exist.\n";
return 0;
}
}
return 1;
}
sub systemd_enabled {
my ($unit_list) = @_;
foreach my $unit (split(/ +/, $unit_list)) {
my $check_cmd = "is-enabled '$unit' > /dev/null 2>&1";
my $status = systemctl($check_cmd);
if ($status) {
lprint "systemd unit '$unit' is not existing or not enabled.\n";
return $status;
}
}
return 0;
}
sub _systemd_op {
my ($op, $unit) = @_;
if (systemctl("cat '$unit' > /dev/null 2>&1")) {
lwarn "systemd unit $unit does not exist.\n";
return 0;
}
my $ctl_cmd = "is-failed --quiet '$unit'";
my $ok = systemctl($ctl_cmd);
if (!$ok) {
my $ctl_cmd = "reset-failed '$unit'";
my $status = systemctl($ctl_cmd);
lprint "--- resetting failed unit '$unit': status=$status\n";
}
if ($op eq "start" || $op eq "restart") {
if (systemd_enabled($unit)) {
return 0;
}
}
$ctl_cmd = "$op '$unit'";
lprint "--- running systemd command: $ctl_cmd\n";
my $status = systemctl($ctl_cmd);
if ($status) {
lwarn "command '$systemctl $ctl_cmd' failed, status=$status\n";
} else {
lprint "--- systemctl status=$status\n";
}
return $status;
}
sub systemd_activate {
my ($cmd, $res, $override, $fail_abort) = @_;
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
my $want_host_path = "$mars/resource-$res/userspace/systemd-want-$host";
if (!$want) {
lprint "Nothing to (de)activate: $want_path does not exist\n" if $verbose;
unlink($want_host_path);
return 0;
}
my $do_activate = $want eq $host;
if ($do_activate) {
# Check attach switch
my $path = "$mars/resource-$res/todo-$host/attach";
if (!get_link($path, 1)) {
$do_activate = 0;
}
}
if ($do_activate) {
my $primary = _get_designated_primary($res);
if ($primary ne $host) {
# Do not activate for now
lprint "Overriding systemd-want: '$host' is not designated primary,\n";
lprint "==== Do not modify resource '$res'\n" if $verbose;
return 0;
}
}
if (defined($override) && $override != $do_activate) {
lprint "Overriding unit activate=$do_activate with $override\n" if $verbose;
$do_activate = $override;
}
if ($do_activate && !device_exists($res)) {
my $dev = device_name($res);
lprint "Device $dev not present, cannot activate systemd unit\n" if $verbose;
$do_activate = 0;
}
my $oper = $do_activate ? "start" : "stop";
my $unit_path = "$mars/resource-$res/systemd-$oper-unit";
my $unit = get_link($unit_path, 2);
if (!$unit) {
lprint "Nothing to (de)activate: $unit_path does not exist\n" if $verbose;
unlink($want_host_path);
return 0;
}
my $ctl_cmd = "show \"$unit\"";
my $op = "show";
if ($do_activate) {
safe_touch($want_host_path) || safe_creat($want_host_path);
$unit =~ s/ .*//;
if (systemd_enabled($unit)) {
return 0;
}
lprint "==== Activate resource '$res' unit '$unit'\n"if $verbose;
$op = "start";
} else {
unlink($want_host_path);
$unit =~ s/.* //;
lprint "==== Deactivate resource '$res' unit '$unit'\n"if $verbose;
$op = "stop";
}
my $status = _systemd_op($op, $unit);
my $response_path = "$mars/resource-$res/userspace/systemd-status-$oper-$host";
set_link($status, $response_path);
finish_links();
if ($status && defined($fail_abort) && $fail_abort) {
lwarn "systemd unit '$unit' operation $op failed, status=$status\n";
return 0;
}
return 1;
}
sub __systemd_commit {
my %changes;
my %act_files = get_instance_files($systemd_target_dir);
my %old_files = get_instance_files($systemd_var_dir);
my %new_files = get_instance_files("$systemd_var_dir.new");
foreach my $old_target (sort alphanum_cmp keys(%old_files)) {
next if defined($new_files{$old_target});
next if !defined($act_files{$old_target});
lprint "-- marking '$old_target' for removal\n" if $verbose > 2;
$changes{$old_target} = -1;
}
system("rm -rf \"$systemd_var_dir.old\"");
system("mv \"$systemd_var_dir\" \"$systemd_var_dir.old\"");
system("mv \"$systemd_var_dir.new\" \"$systemd_var_dir\"");
if (system("cp -a $systemd_var_dir/* \"$systemd_target_dir\"")) {
lwarn "Cannot copy new unit instances from '$systemd_var_dir' to '$systemd_target_dir'\n";
return ();
}
foreach my $new_target (sort alphanum_cmp keys(%new_files)) {
next if defined($old_files{$new_target});
lprint "-- enabling new '$new_target'\n" if $verbose > 2;
my $unit = `basename "$new_target"`;
chomp $unit;
_systemd_op("enable", $unit);
}
return %changes;
}
sub __systemd_generate_all {
my ($cmd) = @_;
return unless -d $mars;
return unless -d $systemd_target_dir;
system("rm -rf \"$systemd_var_dir.new\"");
system("mkdir -p \"$systemd_var_dir.new\"");
system("mkdir -p \"$systemd_var_dir\"");
return unless -d $systemd_var_dir;
return unless -d "$systemd_var_dir.new";
# Determine all template files.
get_template_files();
# Always add all plain templates
%referenced_units = ();
my $count = 0;
foreach my $template_name (sort alphanum_cmp keys(%template_files)) {
next if $template_name =~ m/($systemd_incape|$systemd_escape)/;
$count += make_systemd_unit($cmd, "UNDEFINED_RESOURCE", $template_name);
}
# Determine all participating resource names.
my @res_list = get_member_resources($host);
# Create initial systemd units
foreach my $res (@res_list) {
foreach my $unit_link (lamport_glob("$mars/resource-$res/systemd-*-unit")) {
my $target = get_link($unit_link);
$count += make_systemd_unit($cmd, $res, $target);
}
}
# Compute the transitive closure of referenced units
lprint "== adding transitive units for $count start units.\n" if $verbose;
my %done_units;
for (;;) {
my $old_count = $count;
foreach my $target (sort alphanum_cmp keys(%referenced_units)) {
next if $done_units{$target};
$count += make_systemd_unit($cmd, undef, $target);
$done_units{$target} = 1;
}
last if ($count <= $old_count);
}
lprint "== $count units have changed.\n" if $verbose;
# Check and commit the new situation
my %changes = __systemd_commit();
return %changes;
}
sub __systemd_commit_ops {
my $cmd = shift;
my %changes = @_;
my $deleted = 0;
foreach my $target (sort alphanum_cmp keys(%changes)) {
my $action = $changes{$target};
if ($action < 0) {
lprint "Removing old template instance '$target'\n" if $verbose;
_systemd_op("stop", $target);
system("rm -f \"$systemd_target_dir/$target\"");
$deleted++;
}
}
lprint "== $deleted units have been removed.\n" if $verbose;
lprint "==== Restart systemd\n"if $verbose;
foreach my $unit (@systemctl_enable) {
_systemd_op("enable", $unit);
}
systemctl("daemon-reload");
# Activate all *.path triggers
for my $unit_path (lamport_glob("$systemd_target_dir/*mars*.path")) {
my $unit = `basename "$unit_path"`;
chomp $unit;
lprint "==== Activate path watcher '$unit'\n"if $verbose;
_systemd_op("start", $unit);
}
my $varfile = "$marsadm_var_dir/systemd.status";
system("mv $varfile.tmp $varfile");
}
sub __systemd_activate_ops {
my $cmd = shift;
# Activate the listed units.
my @res_list = get_member_resources($host);
foreach my $res (@res_list) {
systemd_activate($cmd, $res);
}
# Start standard units
foreach my $unit (@systemctl_start) {
_systemd_op("start", $unit);
}
}
sub __systemd_fingerprint {
my $text = "";
# Fingerprint all source templates
get_template_files();
foreach my $template_file (sort alphanum_cmp keys(%template_names)) {
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($template_file);
$text .= "$size,$mtime,$template_file\n";
}
# Fingerprint all resources
$text .= "#\n";
my @res_list = get_member_resources($host);
foreach my $res (@res_list) {
$text .= "$res\n";
my $unit_glob = "$mars/resource-$res/systemd-*-unit";
foreach my $unit_link (lamport_glob($unit_glob)) {
$text .= get_link($unit_link, 1) . "\n";
}
}
# Fingerprint resulting templates (protect against external modifications)
$text .= "#\n";
foreach my $unit_file (lamport_glob($systemd_target_dir)) {
next unless -r $unit_file;
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($unit_file);
$text .= "$size,$mtime,$unit_file\n";
}
return $text;
}
sub is_systemd_generate_necessary {
my $cmd = shift;
# do not generate after forking
return 0 if $child_prefix;
my $text = __systemd_fingerprint();
system("mkdir -p $marsadm_var_dir");
my $varfile = "$marsadm_var_dir/systemd.status";
my $old_text = "";
local $/; # slurp
if (open(IN, "< $varfile")) {
$old_text = <IN>;
close(IN);
} else {
lwarn "file '$varfile' does not yet exist\n";
}
if ($text eq $old_text) {
lprint "systemd template generation not necessary.\n" if $verbose;
return 0 unless ($force && $cmd ne "primary");
lprint "Forcing template generation...\n";
}
if (!open(OUT, "> $varfile.tmp")) {
lwarn "cannot open '$varfile.tmp'\n";
return 1;
}
print OUT $text;
close(OUT);
return 1;
}
sub __systemd_trigger {
my $cmd = shift;
# ensure that trigger file exists
mkdir("$mars/userspace") unless -d "$mars/userspace";
my $trigger = "$mars/userspace/systemd-trigger";
lprint "Triggering '$trigger' for '$cmd'\n" if $verbose;
safe_touch($trigger) || safe_creat($trigger);
}
sub _systemd_trigger {
my ($cmd) = @_;
my $needed_unit = $systemctl_start[0];
if (!systemd_exists($needed_unit)) {
return;
}
if (!systemctl("cat '$needed_unit' > /dev/null 2>&1")) {
if (systemctl("status '$needed_unit' > /dev/null 2>&1")) {
systemctl("enable '$needed_unit'");
systemctl("start '$needed_unit'");
}
}
if (systemd_enabled($needed_unit)) {
return;
}
systemd_lock();
if (is_systemd_generate_necessary($cmd)) {
__systemd_activate_ops($cmd);
lprint "Direct template generation\n" if $verbose;
my %changes;
# Continue with unlock in case of any deaths inbetween
eval {
%changes = __systemd_generate_all($cmd);
};
__systemd_commit_ops($cmd, %changes);
}
__systemd_activate_ops($cmd);
systemd_unlock();
}
sub systemd_trigger {
my ($cmd) = @_;
return unless -d $systemd_target_dir;
systemd_lock();
if (is_systemd_generate_necessary($cmd)) {
my %changes;
# Continue with unlock in case of any deaths inbetween
eval {
%changes = __systemd_generate_all($cmd);
};
__systemd_commit_ops($cmd, %changes);
}
__systemd_activate_ops($cmd);
systemd_unlock();
}
sub _get_systemd_unit {
my ($cmd, $res) = @_;
my $unit_start_path = "$mars/resource-$res/systemd-start-unit";
my $unit_stop_path = "$mars/resource-$res/systemd-stop-unit";
my $start_unit = get_link($unit_start_path, 2);
my $stop_unit = get_link($unit_stop_path, 2);
if (!$start_unit) {
lprint_stderr "Nothing to show: start unit $unit_start_path does not exist\n" if $verbose;
return "";
}
if (!$stop_unit) {
lwarn "Strange: stop unit $unit_stop_path does not exist\n";
return $start_unit;
}
return "$start_unit $stop_unit";
}
sub get_systemd_unit {
my $unit = _get_systemd_unit(@_);
lprint "$unit\n";
}
sub set_systemd_unit {
my ($cmd, $res, $start_name, $stop_name) = @_;
ldie "Start unit name is undefined\n" unless defined($start_name);
$stop_name = $start_name unless defined($stop_name);
my $unit_start_path = "$mars/resource-$res/systemd-start-unit";
my $unit_stop_path = "$mars/resource-$res/systemd-stop-unit";
# Convenience: try to fix any non-escaped resource names
$start_name =~ m/($res)/p;
if ($1) {
my $pre = $PREMATCH;
my $post = $POSTMATCH;
my $replac = _systemd_escape($res);
$start_name = $pre . $replac . $post;
}
$stop_name =~ m/($res)/p;
if ($1) {
my $pre = $PREMATCH;
my $post = $POSTMATCH;
my $replac = _systemd_escape($res);
$stop_name = $pre . $replac . $post;
}
my $want_path = "$mars/resource-$res/systemd-want";
if ($start_name) {
set_link($start_name, $unit_start_path);
my $primary = _get_designated_primary($res, 1);
if ($primary) {
set_link($primary, $want_path);
}
} else {
_create_delete($unit_start_path);
_create_delete($want_path);
}
if ($stop_name) {
set_link($stop_name, $unit_stop_path);
} else {
_create_delete($unit_stop_path);
_create_delete($want_path);
}
finish_links();
systemd_trigger($cmd);
}
##################################################################
# path correction
sub correct_path {
my ($path) = @_;
# actual switches
$path =~ s:(/is-attach)[a-z]*$:$1ed:;
$path =~ s:(/is-)(fetch)[a-z]*$:$1copy:;
$path =~ s:(/is-)(apply)[a-z]*$:$1replay:;
$path =~ s:(/is-(copy|replay|sync))[a-z]*$:$1ing:;
$path =~ s:(/is-)(primary|secondary)[a-z]*$:$1primary:;
# todo switches
$path =~ s:(/fetch)[a-z]*$:/connect:;
$path =~ s:(/apply)[a-z]*$:/allow-replay:;
$path =~ s:(/replay)[a-z]*$:/allow-replay:;
$path =~ s:(/todo-.*/(primary|secondary))[a-z]*$:/primary:;
return $path;
}
##################################################################
# low-level infrastructure
my @link_list = ();
my %link_hash;
sub get_link {
my ($path, $unchecked) = @_;
my $result = readlink($path);
if (!defined($result)) {
ldie "cannot read symlink '$path'\n" unless $unchecked;
lwarn "cannot read symlink '$path'\n" if $unchecked == 1;
$result = "";
}
$result = "" if $result eq ".deleted";
return $result;
}
sub get_link_stamp {
my ($path) = @_;
my @stat = lstat($path);
return 0 if (!@stat);
return $stat[9];
}
sub is_recent {
my ($stamp, $wind) = @_;
return 0 unless ($stamp && $stamp =~ m/^\s*[0-9.]/);
$wind = $window * 2 unless $wind;
return 1 if $stamp + $wind >= mars_time();
return 0;
}
sub is_link_recent {
my ($path, $wind) = @_;
my @stat = lstat($path);
return 0 if (!@stat);
return is_recent($stat[9], $wind);
}
sub get_alive_link {
my ($name, $peer, $unchecked) = @_;
my $path = "$mars/actual-$peer/$name";
my $result;
if ($compat_alivelinks) {
$result = get_link($path, 2);
my $path_old = "$mars/$name-$peer";
my $result_old = get_link($path_old, 2);
return $result_old if !$result;
# determine the newer link
if (get_link_stamp($path_old) > get_link_stamp($path)) {
return $result_old if $result_old;
}
} else {
$result = get_link($path, $unchecked);
}
return $result;
}
sub get_alive_stamp {
my ($name, $peer) = @_;
my $path = "$mars/actual-$peer/$name";
my $result = get_link_stamp($path);
if ($compat_alivelinks) {
my $path_old = "$mars/$name-$peer";
my $result_old = get_link_stamp($path_old);
return $result_old if !$result;
# determine the newer link
if ($result_old > $result) {
return $result_old;
}
}
return $result;
}
sub alive_glob {
my ($name, $hosts) = @_;
$hosts = "*" unless $hosts;
my %peers;
foreach my $path (lamport_glob("$mars/actual-$hosts/$name")) {
next unless $path =~ m:/actual-(.+)/:;
my $peer = $1;
next unless $peer;
$peers{$peer} = 1;
}
if ($compat_alivelinks) {
foreach my $path (lamport_glob("$mars/$name-$hosts")) {
next unless $path =~ m:/$name-(.+):;
my $peer = $1;
next unless $peer;
$peers{$peer} = 1;
}
}
return sort alphanum_cmp keys(%peers);
}
sub to_tmp {
my $path = shift;
$path =~ s:^(.*)/:$1/.tmp.$$.:;
return $path;
}
sub set_link {
my ($src, $dst) = @_;
# safeguard trailing slashes
$dst =~ s:/$::;
my $dst_tmp = to_tmp($dst);
unlink($dst_tmp);
symlink($src, $dst_tmp) or ldie "cannot create symlink '$dst' -> '$src'\n";
# the _order_ is important! remove existing intermediate element before re-appanding
if (exists($link_hash{$dst})) {
my @copy = @link_list;
@link_list = ();
foreach my $elem (@copy) {
next if $elem eq $dst;
push @link_list, $elem;
}
}
$link_hash{$dst} = $src;
push @link_list, $dst;
}
sub finish_links {
my ($timestamp) = @_;
return unless @link_list;
$timestamp = mars_time() unless $timestamp;
lprint "using lamport timestamp $timestamp\n" if $verbose;
my $trigger_code = 1;
my $count = 0;
while (my $link = shift @link_list) {
my $link_tmp = to_tmp($link);
my $target = readlink($link_tmp);
next unless defined($target);
my $this_timestamp = $timestamp;
unless (system("touch -h -d \"\@$this_timestamp\" $link_tmp") == 0) {
lwarn "cannot set mtime on symlink '$link_tmp'\n";
}
if ($dry_run) {
lprint "DRY_RUN: would create symlink '$link' -> '$target'\n";
unlink($link_tmp);
next;
}
# allow replacement of directories with symlinks
rmdir($link) if -d $link;
unless (rename($link_tmp, $link)) {
lwarn "cannot finalize symlink '$link'\n";
} elsif ($verbose) {
lprint "created symlink '$link' -> '$target'\n";
}
$count++;
$trigger_code = 2 if $link =~ m:/(primary|todo-global|ip):;
}
_trigger($trigger_code) if $count > 0;
}
##################################################################
# global checks
sub get_global_versions {
unless (%FLAGS_FEATURES) {
foreach my $txt (keys(%FEATURES_FLAGS)) {
my $mask = $FEATURES_FLAGS{$txt};
$FLAGS_FEATURES{$mask} = $txt;
$chksum_features |= $mask if $txt =~ m/^CHKSUM/;
$compress_features |= $mask if $txt =~ m/^COMPRESS/;
}
}
unless (defined($ARGV[0]) && $ARGV[0] =~ m/cluster|cat/) {
my $act_dir = "$mars/actual-$host";
mkdir($act_dir) unless -d $act_dir;
$kernel_version = get_alive_link("tree", $host, 1);
if ($kernel_version && $user_version != $kernel_version) {
lwarn "kernel_version=$kernel_version user_version=$user_version\n";
if ($user_version < $kernel_version) {
ldie "Sorry, your MARS kernel module uses version $kernel_version, but my $0 userspace version is only $user_version. That cannot work. Please upgrade your userspace scripts!\n";
} elsif (int($user_version) > int($kernel_version) + 1) {
ldie "MAJOR VERSION mismatch : only one major upgrade step is supported.\n";
}
lwarn "using different minor versions is possible, but you should upgrade your kernel module ASAP\n";
}
}
# compute the mimimum of kernel features capabilities
foreach my $peer (get_any_peers()) {
my $features = get_alive_link("features", $peer, 1);
next unless (defined($features) && $features);
next unless $features =~ m/^([0-9]+),?([0-9]*),?([x0-9a-f]*)/;
$features = $1;
my $strategy = $2;
my $flags = $3;
next unless $features > 0;
if ($kernel_features_version < 0 || $features < $kernel_features_version) {
$kernel_features_version = $features;
}
if (defined($strategy) && $strategy) {
if ($strategy < $kernel_strategy_version || $kernel_strategy_version < 0) {
$kernel_strategy_version = $strategy;
}
} else {
$kernel_strategy_version = 0;
}
if (defined($flags) && $flags ne "") {
$flags = hex($flags) if $flags =~ m/^0x/;;
$kernel_flags_version &= $flags;
} else {
$kernel_flags_version = 0x0;
}
}
# can ssh be switched off?
if ($kernel_strategy_version >= 5) {
$ssh_port = 0;
}
# determine cluster-wide $compat_* values
if ($kernel_strategy_version >= 3) {
$compat_alivelinks = 0;
}
my $compat_path = "$mars/compat-deletions";
$compat_deletions = get_link($compat_path, 2);
if ($kernel_features_version < 3 || !defined($compat_deletions) || $compat_deletions eq "") {
$compat_deletions = 1;
}
}
sub get_alive_links {
my $res = shift;
my $alive = shift || "alive";
my $hosts = shift || "*";
my $warn = shift || 0;
my $non_participating = shift || 0;
$res = "*" if (!$res || $res eq "all" || $res =~ m/,/);
my $use_remote_stamp = $alive =~ s/^\^// ? 1 : 0;
my @peer_list;
if ($hosts eq $real_host) {
# needed at join-cluster when nothing is known about others
@peer_list = ($real_host);
} elsif ($non_participating) {
@peer_list = get_total_peers();
} else {
@peer_list = get_any_peers($res ne "*" ? $res : undef);
}
my %peers;
foreach my $peer (@peer_list) {
next if ($peer eq $host && $hosts ne $host);
# After join-cluster & co, links may take a while to appear
$peers{$peer} = 1 if $non_participating;
# peer must be a candiate matching the hosts spec
if ($hosts && $hosts ne "*") {
next unless $peer =~ m/(^|[+,{}])$hosts($|[+,{}])/;
}
# OK: remember peer
$peers{$peer} = 1;
}
$peers{$host} = 1 if $hosts eq $host;
return %peers unless %peers;
my %links;
foreach my $peer (keys(%peers)) {
my $val = get_alive_link($alive, $peer, 2);
# When required and possible, get the _remote_ timestamp
# when the link tree was read _remotely_.
if ($use_remote_stamp) {
my $remote_path = "$mars/actual-$peer/read-stamp";
my $remote_val = get_link($remote_path, 2);
# check compatibility to old versions, and downgrades
if ($compat_alivelinks && $remote_val) {
my $path = "$mars/$alive-$peer";
my $stamp1 = get_link_stamp($path);
my $stamp2 = get_link_stamp($remote_path);
# Try to prefer the new remote stamp.
# Only use the old one when too much outdated.
if ($stamp2 &&
(!$stamp1 || $stamp2 + $window >= $stamp1)) {
$val = $remote_val;
}
}
}
$links{$peer} = $val;
}
my $projection = $any_peers{$host};
if ($projection && $warn) {
my $now = mars_time();
my $extra_count = 0;
my %own_resources = %$projection;
foreach my $peer (keys(%peers)) {
my $stamp = get_alive_link("time", $peer, 2);
next if (!$stamp || $stamp !~ m/^\s*[0-9.]/ || is_recent($stamp));
my $common = 0;
my @peer_resources = get_any_resources($peer);
foreach my $tmp_res (@peer_resources) {
next unless $own_resources{$tmp_res};
$common++;
last;
}
my $age = seconds2human($now - $stamp);
if (!$common) {
# Non-member peers are updated much less frequently,
# thus we need a much larger time window.
if (!is_recent($stamp, 3600)) {
$extra_count++;
if ($verbose) {
lwarn "nonmember peer '$peer' not reachable for $age\n";
}
}
next;
}
my $msg = "no metadata is arriving from peer '$peer', age = $age";
$msg .= " => check your network setup" if is_module_loaded();
lwarn "$msg\n";
}
if ($extra_count) {
lwarn "$extra_count non-member peers are NOT REACHABLE for >= 1 hour\n";
}
}
return %links;
}
##################################################################
# timeout handling
#
# return the lamport clock time in nanosecond resolution
# fallback to system time()
#
sub mars_time {
my $lamport_time;
if (open(my $lamport_clock, "<", "/proc/sys/mars/lamport_clock")) {
while (<$lamport_clock>) {
$lamport_time = $1 if /^lamport_now=(.*)/;
}
close($lamport_clock);
}
return $lamport_time || time() . "." . '0' x 9;
}
my $timeout_val = undef;
sub sleep_timeout {
my $sleeptime = shift || 1;
my $continue = shift;
if ($timeout < 0) {
sleep($sleeptime);
return 0;
}
if (!defined($timeout_val)) {
$timeout_val = $timeout;
lprint "Resetting timeout to $timeout\n" if $verbose > 0;
}
if ($timeout_val <= 0) {
$timeout_val = undef;
if (!defined($continue) || !$continue) {
ldie "Timeout reached.\n";
}
lwarn "Timeout reached. Continuing anyway.\n";
return 1;
}
my $rest = $timeout_val;
$rest = $sleeptime if $rest > $sleeptime;
sleep($rest);
$timeout_val -= $rest;
return 0;
}
# wait for some condition
sub wait_cond {
my ($cmd, $res, $specific) = @_;
my $is_actual = ($specific =~ s/^(is|has)-//);
$specific =~ s/^todo-//;
my $is_on = !($specific =~ s/-(off|0)$//);
$specific =~ s/-(on|1)$//;
if ($is_actual) {
if ($specific eq "device") {
check_mars_device($cmd, $res, 1, !$is_on);
return;
}
my %table =
(
"attach" => "is-attached",
"attached" => "is-attached",
"replay" => "is-replaying",
"replaying"=> "is-replaying",
"fetch" => "is-copying",
"fetching" => "is-copying",
"copy" => "is-copying",
"copying" => "is-copying",
"sync" => "is-syncing",
"syncing" => "is-syncing",
"primary" => "is-primary",
"secondary"=> "is-primary",
);
my $name = $table{$specific};
ldie "actual indicator '$specific' does not exist\n" unless exists($table{$specific});
$is_on = !$is_on if $name eq "secondary";
check_status($cmd, $res, $name, $is_on ? 1 : 0, 1, 1);
} else {
my %table =
(
"attach" => "attach",
"attached" => "attach",
"fetch" => "fetch",
"connect" => "fetch",
"replay" => "replay",
"sync" => "sync",
"primary" => "primary",
"secondary" => "secondary",
);
my $name = $table{$specific};
ldie "button '$specific' does not exist\n" unless exists($table{$specific});
check_todo($cmd, $res, $name, $is_on ? 1 : 0, 1);
}
}
# wait until everything is recent
sub wait_cluster {
my ($cmd, $res, $hosts, $abort, $trigger_code, $condition) = @_;
$res = "all" if (!$res || $res =~ m/,/);
$hosts = "*" unless $hosts;
$abort = $force unless defined($abort);
$trigger_code = ($hosts =~ m/\*/ ? 3 : 2) unless $trigger_code;
my $non_participating = ($trigger_code >= 8);
$timeout_val = undef;
get_global_versions();
finish_links();
if ($hosts eq $real_host && $kernel_strategy_version >= 5) {
return 0;
}
my $max_restarts = 3;
my $max_resets = 1;
restart:
lprint "Ping $hosts trigger=$trigger_code\n";
_trigger($trigger_code);
my %old_status = get_alive_links($res, "time", $hosts, 0, $non_participating);
my $total_count = scalar(keys(%old_status));
my $start_time = mars_time();
my %progress;
foreach my $peer (keys(%old_status)) {
$progress{$peer} = $non_participating ? 1 : 0;
}
my $count = scalar(keys(%old_status));
if ($count && $count < 10) {
lprint "Wait for answers from " . join(",", sort alphanum_cmp keys(%old_status)) . "\n";
} else {
lprint "Wait for answers from $count peers\n";
}
my $delta = $window > 0 ? $window / 2 : 30;
while (1) {
# Early abort when condition is met
if ($condition) {
my $status = eval($condition);
lprint "Condition '$condition' status=$status\n" if $verbose;
return $status if $status;
}
my $dead_count = 0;
my $alive_count = 0;
my $unknown_count = 0;
# Notice: prefer the remote timestamps for race prevention
my %status = get_alive_links($res, "^time", $hosts, 0, $non_participating);;
my $now = mars_time();
foreach my $peer (keys(%status)) {
next if ($peer eq $host && $hosts ne $host);
if (!$status{$peer}) {
# This can happen at join-cluster & co
my $ip_link = "$mars/ips/ip-$peer";
my $ip_link_stamp = get_link_stamp($ip_link);
if ($ip_link_stamp &&
$ip_link_stamp + $delta < $now &&
$ip_link_stamp > 1) {
$dead_count++;
} elsif ($start_time + $delta < $now) {
$dead_count++;
} else {
$unknown_count++;
}
next;
} elsif (!$old_status{$peer}) {
# Oh, only the old info is outdated, but we have a new one.
# Also possible at join-cluster: restart freshly
lwarn "Need restart for getting more 'time' links\n";
goto restart if $max_restarts-- > 0;
}
if ($status{$peer} !~ m/^[0-9.]+$/) {
$dead_count++;
next;
}
if ($status{$peer} > $old_status{$peer} &&
$status{$peer} > $start_time &&
$progress{$peer}-- <= 0) {
$alive_count++;
} elsif ($status{$peer} + $delta < $now &&
$old_status{$peer} + $delta < $now) {
$dead_count++;
} else {
$unknown_count++;
}
}
if ($unknown_count) {
_trigger($trigger_code);
# ensure more progress will happen
%old_status = %status if !$max_resets++;
} else {
if (!$dead_count) {
lprint "$alive_count/$total_count peer(s) seem to be alive\n";
} else {
lwarn "$alive_count/$total_count peer(s) seem to be alive, and $dead_count peer(s) seem to be dead / not reachable\n";
ldie "aborting (--force was not given)\n" if $abort;
}
last;
}
sleep_timeout(1, 1);
last if $timeout_val <= 0 && !$unknown_count;
}
return 0;
}
sub wait_cluster_noforce {
my ($cmd, $res) = @_;
if (!$force) {
lprint "WAITING for communication\n" if $verbose;
wait_cluster($cmd, $res, "*", 0);
}
}
sub update_cluster {
my ($cmd, $res, $hosts, $condition) = @_;
$res = "all" if (!$res || $res =~ m/,/);
$hosts = "*" unless $hosts;
lprint "UPDATING $res\n" if $verbose;
my $status = wait_cluster($cmd, $res, $hosts, 0, 8, $condition);
_reset_resources();
return $status;
}
sub is_cluster_recent {
my ($cmd, $res, $hosts) = @_;
my $dead_count = 0;
my $alive_count = 0;
my $unknown_count = 0;
my $now = mars_time();
my %status = get_alive_links($res, "time", $hosts);
foreach my $peer (keys(%status)) {
next if $peer eq $host;
if ($status{$peer} + $window/2 >= $now) {
$alive_count++;
} elsif ($status{$peer} + $window < $now) {
$dead_count++;
} else {
$unknown_count++;
}
}
return ($dead_count, $alive_count, $unknown_count);
}
sub recent_cluster {
my $cmd = shift;
my $res = shift;
$res = "all" if (!$res || $res =~ m/,/);
my $hosts = shift || "*";
my ($dead_count, $alive_count, $unknown_count) = is_cluster_recent($cmd, $res, $hosts);
return 1 if (!$dead_count && !$unknown_count);
wait_cluster($cmd, $res, $hosts, 0);
return 0;
}
# Newer kernel modules do no longer work on non-member resources
# by defaults, for scalability reasons.
# Activate such a resource as a guest.
sub _activate_resource {
my ($cmd, $res, $peer) = @_;
$peer = $host unless $peer;
finish_links();
if (!$res || $res eq "*" || $res eq "all") {
lwarn "no usable resource to activate ($res).\n";
return;
}
my $resdir = "$mars/resource-$res";
my $max_retry = 3;
until (-d $resdir) {
my $condition = "return -d \"$resdir\";";
update_cluster($cmd, $res, "*", $condition);
last if -d $resdir;
if ($max_retry-- < 0) {
ldie "Resource directory '$resdir' does not exist\n";
}
lwarn "Resource directory '$resdir' does not yet exist, fetching...\n";
sleep(3);
}
lprint "OK, resource directory '$resdir' exists.\n";
my $check_link = "$resdir/size";
$max_retry = 3;
until (get_link($check_link, 1)) {
my $condition = "return get_link(\"$check_link\", 1);";
update_cluster($cmd, $res, "*", $condition);
if ($max_retry-- < 0) {
ldie "link '$check_link' does not exist.\n";
}
lwarn "link '$check_link' does not yet exist, fetching...\n";
sleep(3);
}
lprint "OK, link '$check_link' exists.\n";
my $active_path = "$resdir/device-$peer";
my $check = get_link($active_path, 2);
if ($check) {
if ($peer ne $real_host) {
_push_check($peer, "", $resdir);
_push_check($peer, "", $active_path);
}
lprint "Resource '$res' was already activated at '$peer'\n";
return;
}
lprint "ENABLING resource '$res' at '$peer'\n";
# Ensure guest communication: a faked device-$peer must exist
# for activation.
# Notice: this may be later overwritten with another value.
set_link($res, $active_path);
# prevent fallback to old device detection
my $actual_dir = "$resdir/actual-$peer";
mkdir($actual_dir);
my $if_path = "$actual_dir/if-on";
set_link("0", $if_path);
finish_links();
_push_check($peer, "", $active_path) if $peer ne $real_host;
# wait for (self-)activation
wait_cluster($cmd, $res, $peer, 0);
# activated peers might have changed
my $primary = _get_designated_primary($res, 1);
for (my $retry = 3; $retry >= 0; $retry--) {
_trigger(3);
my $condition = "return _get_designated_primary(\"$res\", 1);";
my $from = ($primary && $primary ne "(none)") ? $primary : "*";
my $old_primary = $primary;
$primary = update_cluster($cmd, $res, $from, $condition);
last if ($primary && $primary eq $old_primary);
sleep(3);
}
}
sub activate_guest {
my ($cmd, $res, $peer) = @_;
$peer = $host unless $peer;
my $resdir = "$mars/resource-$res";
if ($cmd =~ m/deactivate/) {
ldie "resource directory '$resdir' does not exist.\n" unless -d $resdir;
my $check_glob = "$resdir/*{data,replay}-$peer";
if (lamport_glob($check_glob)) {
ldie "peer '$peer' ist a storage member of resource '$res'\n";
}
my $guest_path = "$resdir/device-$peer";
_create_delete($guest_path);
return;
}
_activate_resource($cmd, $res);
ldie "Could not get resource name '$res'\n" unless -d $resdir;
}
##################################################################
# syntactic checks
# (also check for existence)
sub check_id {
my ($str, $must_exist) = @_;
ldie "identifier '$str' is empty" unless defined($str) && $str;
ldie "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z0-9_][-A-Za-z0-9_]*$/;
ldie "identifier '$str' is too long (only 63 chars supported according to RFC 1123)" if length($str) > 63;
if (defined($must_exist) && $must_exist) {
my $ip_path = "$mars/ips/ip-$str";
unless (get_link($ip_path, 1)) {
ldie "host '$str' does not exist in $mars/ips/\n" if !$force;
lwarn "host '$str' does not exist in $mars/ips/ - hopefully you know what you are doing via --force\n";
}
}
}
sub check_id_list {
my ($str, $must_exist) = @_;
ldie "comma-separated list '$str' has disallowed characters" unless $str =~ m/^[A-Za-z0-9_][-A-Za-z0-9_,]*$/;
foreach my $id (split(",", $str)) {
check_id($id, $must_exist);
}
}
##################################################################
# semantic checks
sub check_res {
my $res = shift;
wait_cluster("wait-cluster", "all", "*", 0) unless -d "$mars/resource-$res";
if (not -d "$mars/resource-$res") {
# Ensure that all cluster information is present
# DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique
my $count = 0;
my $found;
my @tests = lamport_glob("$mars/resource-*/device-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
if ($target eq $res) {
$found = $test;
$count++;
}
}
if (!$count) {
@tests = lamport_glob("$mars/resource-*/_direct-*-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
$target =~ s/^.*,//;
if ($target eq $res) {
$found = $test;
$count++;
}
}
}
ldie "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found;
$found =~ s:^.*/resource-(.*)/.*$:$1:;
lwarn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n";
$res = $found;
}
return $res;
}
sub _get_mars_size {
my ($cmd, $res) = @_;
my $dev_name = get_link("$mars/resource-$res/device-$host");
my $info = "/sys/devices/virtual/block/mars!$dev_name/size";
return `cat $info` * 512;
}
sub check_sizes {
my ($res, $peer) = @_;
my $physical_size = get_link("$mars/resource-$res/actsize-$peer", 2) || return;
my $logical_size = get_link("$mars/resource-$res/size", 1);
if (defined($physical_size) && $physical_size < $logical_size) {
lwarn "Physical device on host '$peer' has size $physical_size, which is smaller than the logical resource size $logical_size\n";
ldie "This is too dangerous. It cannot work. Fix it!\n" unless $force;
}
}
sub check_res_member {
my ($cmd, $res) = @_;
if (! link_exists("$mars/resource-$res/data-$host")) {
if (link_exists("$mars/resource-$res/device-$host")) {
# guest
return;
}
if (link_exists("$mars/resource-$res/replay-$host")) {
lwarn "Resource '$res' seems to have been destroyed.\n";
lwarn "Nevertheless, a replay link exists for host '$host'.\n";
lwarn "This can happen after 'leave-resource --host=$host' while host $host was active.\n";
lwarn "mars-user-manual.pdf forbids any usage of such a resource _strongly_.\n";
lwarn "In order to finally remove this resource from $host, use the\n";
lwarn "command 'marsadm leave-resource --force $res'\n";
} else {
lwarn "Sorry, I have not yet joined to resource '$res'\n";
}
ldie "Refusing work on resource name '$res'\n" unless $force;
lwarn "Running '$cmd' is dangerous, continuing on your own risk\n" unless $cmd eq "leave-resource";
}
check_sizes($res, $host);
}
sub _sync_finished {
my ($res, $peer) = @_;
my $lnk = "$mars/resource-$res/syncstatus-$peer";
return 0 unless link_exists($lnk);
my $syncstatus = get_link($lnk, 1);
my $size = get_link("$mars/resource-$res/size");
return 0 if ($size <= 0);
return 0 if ($syncstatus < $size);
my $syncpos = get_link("$mars/resource-$res/syncpos-$peer", 2);
if (defined($syncpos) && $syncpos) {
my $replay = get_link("$mars/resource-$res/replay-$peer", 1);
return 0 unless $replay;
my $cmp = compare_replaylinks($syncpos, $replay);
return 0 if $cmp > 0;
}
return 1;
}
sub check_sync_finished {
my ($res, $peer, $do_force) = @_;
check_sizes(@_);
if (!_sync_finished($res, $peer)) {
lwarn "Sync has not yet finished on host '$peer'\n";
if ($peer eq $host) {
lwarn "Don't try to make inconsistent host '$host' the new primary!\n";
lwarn "Please wait until sync has finished and all logfile have been replayed.\n";
ldie "Refusing to switch inconsistent host '$host' to primary\n";
} else {
lwarn "Changing the primary role during sync is dangerous for data consistency on host '$peer'!\n";
}
unless ($do_force) {
lprint "HINT: consider the option --ignore-sync if you are sure that you want to restart the sync\n";
ldie "First wait for sync finished before primary handover, or use --ignore-sync\n";
}
}
lprint "OK, it seems that sync has finished on host '$peer'.\n";
}
sub check_primary {
my ($cmd, $res, $no_designated, $no_fail) = @_;
my $fail = 0;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk, 1);
if (!$is_primary) { # give it a second chance
$is_primary = device_exists($res);
}
unless ($is_primary) {
lwarn "For operation '$cmd' I need to be primary\n";
$fail++;
}
if ($no_designated) {
my $primary = _get_designated_primary($res);
unless ($primary eq $host) {
lwarn "For operation '$cmd', I also must be the designated primary\n";
}
}
unless (defined($no_fail) && $no_fail) {
ldie "Operation '$cmd' only works at the primary side\n" if $fail;
}
return $fail;
}
sub check_not_primary {
my ($cmd, $res, $allow_force) = @_;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk, 1);
if ($is_primary) {
if (!$force || !defined($allow_force) || !$allow_force) {
ldie "operation '$cmd' cannot be executed on actual primary\n";
}
lwarn "operation '$cmd' is forced on actual primary '$host', THIS IS RISKY\n";
}
# also check whether we intend to become primary
my $primary = _get_designated_primary($res, 1);
if ($primary eq $host) {
lwarn "operation '$cmd' cannot be executed on designated primary\n";
ldie "first switch the designated primary, or use --force if you are sure.\n" unless $force;
lwarn "continuing anyway due to --force\n";
}
}
sub check_primary_gone {
my ($cmd, $res, $peer) = @_;
return 0 unless (defined($peer) && $peer && $peer ne "(none)");
return 0 if $peer eq $host;
my $lnk = "$mars/resource-$res/actual-$peer/is-primary";
my $is_primary = get_link($lnk, 1);
if (defined($is_primary) && $is_primary eq "0") {
lprint "OK, other host ($peer) is not actual primary.\n";
return 0;
}
lprint "waiting for other primary host ($peer) to disappear....\n";
return 1;
}
sub _make_messages {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
my %table =
(
"is-attached" => "attach",
"is-syncing" => "sync",
"is-copying" => "fetch",
"is-replaying" => "replay",
"is-primary" => "primary",
);
my $key_msg = defined($table{$key}) ? $table{$key} : $key;
my $val_msg = $val ? "on" : "off";
my $wait_msg = $wait ? "waiting $timeout until" : "ensuring that";
my $new_val = (defined($inv) && $inv) ? !$val : $val;
my $action_msg = ($new_val ? "'resume-" : "'pause-") . $key_msg . "'";
if ($key_msg eq "primary") {
$action_msg = $new_val ? "'$key_msg'" : "'secondary' (or better 'primary' on another host)";
} elsif ($key_msg eq "attach") {
$action_msg = $new_val ? "'$key_msg'" : "'detach'";
}
$action_msg = "Use $action_msg and/or wait until it has really succeeded.";
$action_msg .= " Notice that MARS is an ASYNCHRONOUS system, where both execution of actions, as well as propagation over the network may take some time." if $verbose;
$action_msg .= " Don't expect magic to happen if the network has some problems, or when the system load is extremely high." if $verbose > 1;
$action_msg .= " Please read the PDF manual, and try to understand it." if $verbose > 2;
return ($key_msg, $val_msg, $wait_msg, $action_msg);
}
sub check_todo {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
$key =~ s/fetch/connect/;
my $path = "$mars/resource-$res/todo-$host/$key";
$path = correct_path($path);
my $link;
for (;;) {
$link = get_link($path, $unchecked);
return unless defined($link);
$link = ($link eq $host) ? 1 : 0 if $key eq "primary";
$link = ($link eq "(none)") ? 1 : 0 if $key eq "secondary";
my ($key_msg, $val_msg, $wait_msg, $action_msg) = _make_messages(@_);
if (defined($inv) && $inv) {
last if $link != $val;
lprint "$wait_msg switch '$key' != '$val'....\n";
ldie "Cannot execute $cmd on resource $res: todo-switch '$key_msg' must not be $val_msg. $action_msg\n" if !$wait;
} else {
last if $link == $val;
lprint "$wait_msg switch '$key' == '$val'....\n";
ldie "Cannot execute $cmd on resource $res: todo-switch '$key_msg' must be $val_msg, but actually has value '$link'. $action_msg\n" if !$wait;
}
sleep_timeout();
}
lprint "OK, '$path' has acceptable value '$link'\n";
}
sub _run_compensation {
my ($compensation) = @_;
if (defined($compensation)) {
my $txt = $compensation;
$txt =~ s/\n/\\n/mg;
lprint "Running compensation action '$txt'\n";
eval $compensation;
}
}
sub check_status {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv, $peer, $action, $compensation) = @_;
$peer = $host unless defined($peer);
my $path = correct_path("$mars/resource-$res/actual-$peer/$key");
my $link;
my $rounds = 0;
my $fail_round = 10;
for (;;) {
$link = get_link($path, $unchecked);
$link = 0 unless (defined($link) && $link ne "");
my ($key_msg, $val_msg, $wait_msg, $action_msg) = _make_messages(@_);
if (defined($inv) && $inv) {
last if $link != $val;
lprint "at $peer: $wait_msg actual '$key' != '$val'...\n";
ldie "Cannot execute $cmd on resource $res: actual '$key_msg' must not be $val_msg. $action_msg Also ensure that your command _can_ succeed.\n" if !$wait;
} else {
last if $link == $val;
lprint "at $peer: $wait_msg actual '$key' == '$val'...\n";
ldie "Cannot execute $cmd on resource $res: actual '$key_msg' must be $val_msg. $action_msg Also ensure that your command _can_ succeed.\n" if !$wait;
}
if (defined($action) && $action && $rounds > 1) {
lprint "action: $action\n" if $verbose;
my $action_status = 0;
my $old_error_count = $error_count;
eval "$action";
$error_count = $old_error_count;
# Tolerate intermediate failures for some time
if ($action_status && $rounds > $fail_round) {
_run_compensation($compensation);
ldie "Action failure, status=$action_status\n";
}
}
my $status = sleep_timeout(undef, 1);
if ($status) {
_run_compensation($compensation);
ldie "Timeout\n";
}
$rounds++;
}
lprint "OK at $peer: '$path' has acceptable value '$link'\n";
}
sub check_mars_device {
my ($cmd, $res, $wait, $inv) = @_;
my $dev = device_name($res);
my $backoff = 1;
my $round = 0;
if ($inv) {
while (device_exists($res)) {
ldie "cannot execute $cmd: device '$dev' has not yet disappeared\n" if !$wait;
lwarn "device '$dev' has not yet disappeared\n";
sleep_timeout($backoff);
# very slowly increasing backoff
if ($backoff < 10 && $round++ > 5) {
$round = 0;
$backoff++;
}
systemd_activate($cmd, $res, 0, 1);
}
lprint "device '$dev' is no longer present\n" unless device_exists($res);
return;
}
# !$inv
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', I should be the designated primary\n" unless $primary eq $host;
while (!device_exists($res)) {
my $text = get_error_text($cmd, $res);
lprint $text if $text;
ldie "aborting due to errors\n" if $text =~ m/error/mi;
ldie "cannot execute $cmd: device '$dev' not yet present\n" if !$wait;
lprint "device '$dev' not yet present\n";
sleep_timeout($backoff);
# very slowly increasing backoff
if ($backoff < 10 && $round++ > 5) {
$round = 0;
$backoff++;
}
}
lprint "device '$dev' is present\n" if device_exists($res);
}
sub check_userspace {
my ($dst) = @_;
if ($dst !~ m:/userspace/:) {
ldie "your path '$dst' must be inside $mars/userspace/ or $mars/resource-*/userspace/\n" unless $force;
lwarn "your path '$dst' is outside $mars/userspace/ or $mars/resource-*/userspace/ and you gave --force, hopefully YOU KNOW WHAT YOU ARE DOING\n";
}
}
sub check_sync_startable {
my ($cmd, $res) = @_;
my $primary = _get_designated_primary($res);
ldie "Cannot execute '$cmd' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
# no danger when switch is turned on at the primary side.
return if $primary eq $host;
my $emergency_path = "$mars/resource-$res/actual-$primary/has-emergency";
my $emergency = get_link($emergency_path, 1);
if ($emergency) {
ldie "Primary '$primary' is in emergency mode. Cannot start sync.\nFree some space there first.\n";
}
}
##################################################################
# state inspection routines
sub _get_minmax {
my ($res, $glob, $take_symlink) = @_;
my $min = -1;
my $max = -1;
my @paths = lamport_glob($glob) or lwarn "cannot find '$glob'\n";
foreach my $path (@paths) {
my $nr = $path;
if ($take_symlink) {
$nr = get_link($path, 1);
}
$nr =~ s@^(?:.*/)?[a-z]+-([0-9]+)(-[^/]*)?$@$1@;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
}
return ($min, $max);
}
sub get_minmax_logfiles {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/log-*$peer", 0);
}
sub get_minmax_versions {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/version-*$peer", 0);
}
sub get_minmax_replays {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/replay-*$peer", 1);
}
##################################################################
# generic comparisons
sub compare_replaylinks {
my ($a, $b) = @_;
$a =~ m/log-([0-9]+)[^,]*,([0-9]+)/;
my ($a_log, $a_pos) = ($1, $2);
$b =~ m/log-([0-9]+)[^,]*,([0-9]+)/;
my ($b_log, $b_pos) = ($1, $2);
return -1 if $a_log < $b_log;
return +1 if $a_log > $b_log;
return -1 if $a_pos < $b_pos;
return +1 if $a_pos > $b_pos;
return 0;
}
##################################################################
sub get_amount {
my ($resdir, $host, $direction, $only_files) = @_;
my $level = 0;
my $firstpos = 0;
my $logpos = 0;
my $oldpos = 0;
my $sum = 0;
for (;;) {
my $val0 = 0;
my $ok = 0;
if (!$level) {
my $replay_path = sprintf("%s/replay-%s", $resdir, $host);
my $replay_link = get_link($replay_path, 1);
return (0, 0, 0, 0) if !$replay_link;
return (0, 0, 0, 0) if $replay_link !~ m:log-([0-9]+)-[^,]+,([0-9]+),([0-9]+):;
$firstpos = int($1);
$logpos = $firstpos;
$oldpos = $firstpos;
if ($direction < 0) {
$sum = $2;
} else {
$val0 = $2;
$sum = $3;
}
}
if ($level > 0 || $direction > 0) {
my $file = sprintf("%s/log-%09d-%s", $resdir, $logpos, $host);
my @stat = stat($file);
my $val = 0;
if (@stat && file_exists($file)) {
$val = $stat[7];
$ok = 1;
} else {
my $glob = sprintf("%s/log-%09d-*", $resdir, $logpos);
foreach $file (lamport_glob($glob)) {
my @tstat = stat($file);
if (@tstat && $tstat[7] > $val) {
@stat = @tstat;
$val = $stat[7];
$ok = 1;
}
}
}
if (!$only_files) {
my $glob = sprintf("%s/version-%09d-*", $resdir, $logpos);
foreach $file (lamport_glob($glob)) {
my $vers_link = get_link($file, 1);
if ($vers_link && $vers_link =~ m;,([0-9]+):;) {
my $nval = $1;
$val = $nval if $nval > $val;
$ok = 1;
}
}
}
return ($sum, $firstpos, $oldpos, $level) unless $ok;
if ($level > 0) {
$sum += $val;
} else {
# the logfile may be bigger than the replay pos/length
my $new_sum = $val - $val0;
$sum = $new_sum if $new_sum > $sum;
$sum = 0 if $sum < 0;
}
}
$oldpos = $logpos;
if ($direction > 0) {
$logpos++;
} elsif ($direction < 0) {
$logpos--;
return ($sum, $firstpos, $oldpos, $level) if $logpos < 1;
} else {
return ($sum, $firstpos, $oldpos, $level);
}
$level++;
}
}
##################################################################
# versionlink path handling routines
my %visited_pos;
sub _visit {
my ($nr, $peer) = @_;
$nr =~ s:^0*::;
my $visit = "$nr,$peer";
$visited_pos{$visit} = 1;
}
sub _is_visited {
my ($nr, $peer) = @_;
$nr =~ s:^0*::;
my $visit = "$nr,$peer";
return $visited_pos{$visit};
}
sub _parse_pos {
my ($basedir, $pos) = @_;
if ($pos =~ m/log-([0-9]+)-([^,]+)/) {
_visit($1, $2);
} elsif ($pos =~ m/version-([0-9]+)-([^,]+)/p) {
my $vers = get_link("$basedir/$MATCH");
my $count_matches = 0;
while ($vers =~ m/log-([0-9]+)-([^,]+)/p) {
$vers = $POSTMATCH;
_visit($1, $2);
$count_matches++;
}
lwarn "cannot parse '$pos' -> '$vers'\n" unless $count_matches;
} else {
lwarn "cannot parse '$pos'\n";
}
$pos =~ m/((?:log|version)-([0-9]+)-([^,]+)(?:,([0-9]+))?)/ or lwarn "cannot parse position info '$pos'\n";
return ($1, int($2), $3, defined($4) ? int($4) : -1);
}
sub _get_prev_pos {
my ($basedir, $nr, $peer) = @_;
my $path = sprintf("version-%09d-$peer", $nr);
my $vers = get_link("$basedir/$path", 2);
if (!defined($vers) || !$vers) {
# Scarce race.
# This can happen when new logfiles are present but not yet worked on.
# Also improves robustness on damaged filesystems.
# Decrement nr by hand, try, take the first part.
$path = sprintf("version-%09d-$peer", $nr - 1);
$vers = get_link("$basedir/$path", 2);
$vers =~ s/:.*// if defined($vers) && $vers;
} else {
# take the last part, pointing to the predecessor versionlink.
$vers =~ s/^.*://;
}
_parse_pos($basedir, $path) if defined($vers) && $vers;
return $vers;
}
sub _get_common_ancestor {
my ($total1, $total2) = (0, 0);
for (;;) {
my ($basedir, $pos1, $host1, $dep1, $pos2, $host2, $dep2) = @_;
my ($p1, $nr1, $from1, $len1) = _parse_pos($basedir, $pos1);
my ($p2, $nr2, $from2, $len2) = _parse_pos($basedir, $pos2);
if ($p1 eq $p2) {
# usually no split brain here (only if both path depths are non-zero)
my $split = ($dep1 && $dep2);
if (!$split) { # additionally check the corresponding version links
my $path1 = sprintf("$basedir/version-%09d-$from1", $nr1);
my $path2 = sprintf("$basedir/version-%09d-$from2", $nr2);
if (my $vers1 = get_link($path1, 1) and my $vers2 = get_link($path2, 1)) {
if ($vers1 ne $vers2) {
$split = 1;
$total1 += $len1;
$total2 += $len2;
}
}
}
return ($p1, $split, $total1, $total2);
} elsif ($nr1 > $nr2) {
# just flip arguments
@_ = ($basedir, $pos2, $host2, $dep2, $pos1, $host1, $dep1);
($total1, $total2) = ($total2, $total1);
next;
} elsif ($nr1 < $nr2) {
# recursively advance path depth
$total2 += $len2;
my $vers2 = _get_prev_pos($basedir, $nr2, $host2);
return ("", -1, $total1, $total2) if !$vers2;
@_ = ($basedir, $pos1, $host1, $dep1, $vers2, $host2, $dep2 + 1);
next;
} elsif ($from1 ne $from2) {
# split brain is sure now, but continue computing the common split point
my $vers1 = _get_prev_pos($basedir, $nr1, $host1);
return ("", 1, $total1 + $len1, $total2 + $len2) if !$vers1;
my $vers2 = _get_prev_pos($basedir, $nr2, $host2);
return ("", 1, $total1 + $len1, $total2 + $len2) if !$vers2;
my ($res, $split, $t1, $t2) = _get_common_ancestor($basedir, $vers1, $host1, $dep1 + 1, $vers2, $host2, $dep2 + 1);
return ($res, 1, $t1 + $total1 + $len1, $t2 + $total2 + $len2);
} elsif ($len1 < $len2) {
# there may be no split brain (just incomplete replay) depending on path depth
return ($p1, $dep1, $total1 + $len1, $total2 + $len2);
} elsif ($len2 < $len1) {
# dto symmetric
return ($p2, $dep2, $total1 + $len1, $total2 + $len2);
}
lwarn "error in algorithm: $p1, $nr1, $from1, $len1 : $p2, $nr2, $from2, $len2\n";
return ("", -1, 0, 0);
}
}
sub get_common_ancestor {
my ($basedir, $host1, $host2) = @_;
my $repl1 = get_link("$basedir/replay-$host1", 1);
my $repl2 = get_link("$basedir/replay-$host2", 1);
return _get_common_ancestor($basedir, $repl1, $host1, 0, $repl2, $host2, 0);
}
my %detected_splits = ();
sub detect_splitbrain {
my ($res, $do_report) = @_;
# dynamic programming
return $detected_splits{$res} if defined($detected_splits{$res});
my $basedir = "$mars/resource-$res";
my $ok = 1;
my @list = lamport_glob("$mars/resource-$res/replay-*");
my @hosts = map { $_ =~ s:.*/replay-::; $_ } @list;
foreach my $host1 (@hosts) {
foreach my $host2 (@hosts) {
next if $host1 ge $host2;
my ($point, $split, $size1, $size2);
for (my $retry = 2; $retry > 0; $retry--) {
($point, $split, $size1, $size2) = get_common_ancestor($basedir, $host1, $host2);
last if !$split;
# Workaround races.
sleep(0);
}
if ($split) {
$ok = 0;
if ($do_report) {
my $age = "";
if ($point) {
my $log = "$basedir/$point";
$log =~ s:,.+::;
my $stamp = get_link_stamp($log);
my $vers = $log;
$vers =~ s:/log-:/version-:;
my $stamp2 = get_link_stamp($vers);
# take the minimum
$stamp = $stamp2 if !$stamp || ($stamp2 && $stamp2 < $stamp);
$age = " age ~" . seconds2human(mars_time() - $stamp) if $stamp;
}
lwarn "SPLIT BRAIN of resource '$res' after logfile '$point'$age\n";
if ($point) {
lwarn " hostA = '$host1' logfile_amount='$size1' (" . number2human($size1) . ")\n";
lwarn " hostB = '$host2' logfile_amount='$size2' (" . number2human($size2) . ")\n";
}
} else {
return $ok;
}
}
}
}
if ($ok) { # check for duplicate logfiles
my @logs = lamport_glob("$mars/resource-$res/log-*");
my $oldnr = -1;
foreach my $path (sort alphanum_cmp @logs) {
$path =~ m:/log-([0-9]+):;
my $nr = $1;
if ($nr == $oldnr) {
$ok = 0;
lwarn "SPLIT BRAIN at resource '$res' detected: duplicate logfile number $nr\n";
lwarn "hint: first resolve split brain by 'leave-resource' or 'invalidate'\n";
lwarn "hint: if this does not help, try cleanup via 'log-purge-all'\n";
lwarn "hint: if this does not help, try 'log-purge-all --force'\n";
last;
}
$oldnr = $nr;
}
}
$detected_splits{$res} = $ok;
return $ok;
}
sub _mark_path_backward {
my ($basedir, $pos, $peer, $skip, $jump_peer) = @_;
my $sum = 0;
my $base_nr = 0;
for (;;) {
my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos);
last if defined($skip) && $nr < $skip;
$base_nr = $nr;
_visit($nr, $peer);
# When following chains from foreign hosts (e.g. the designated primary),
# we must jump over to our own chain somewhen, because the lengths of
# the chains may be different (caused by invalidate & friends).
if (defined($jump_peer) && $jump_peer ne $peer) {
my $peer_path = sprintf("$basedir/version-%09d-$peer", $nr);
my $jump_path = sprintf("$basedir/version-%09d-$jump_peer", $nr);
my $peer_version = get_link($peer_path, 2);
my $jump_version = get_link($jump_path, 2);
if (defined($peer_version) && defined($jump_version) && $peer_version eq $jump_version) {
$peer = $jump_peer;
}
}
$pos = _get_prev_pos($basedir, $nr, $peer, 1);
last if !$pos;
# optionally don't count the last versionlink, pointing into nirvana
if (defined($skip) && $skip && $nr > 1) {
my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos);
last if !$p;
my $next = _get_prev_pos($basedir, $nr, $peer, 1);
last if !$next;
}
$sum += $len;
}
return ($sum, $base_nr);
}
sub _mark_path_forward {
my ($basedir, $pos, $peer) = @_;
my @list = ($pos);
while (@list) {
my %next_list;
foreach $pos (@list) {
my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos);
my $cand = sprintf("$basedir/version-%09d-$peer", $nr + 1);
my $vers = get_link($cand, 2);
next unless defined($vers) && $vers ne "";
$vers =~ s/^.*://;
my ($cp, $cnr, $cfrom, $clen) = _parse_pos($basedir, $vers);
if (int($cnr) == int($nr) && $cfrom eq $from && $clen == $len) {
$next_list{$cand} = 1;
}
}
@list = keys(%next_list);
}
}
sub _mark_path_transitive {
_mark_path_forward(@_);
_mark_path_backward(@_);
}
sub log_purge_res {
my ($cmd, $res) = @_;
lwarn "DANGEROUS OPERATION: $cmd --force on resource '$res'\n" if $force;
%visited_pos = ();
my %logs;
my %start_logs;
my $basedir = "$mars/resource-$res";
my $max_retry = 3;
retry:
my $start_count = 0;
my %situation = ();
foreach my $data (lamport_glob("$basedir/{data,replay}-*")) {
$data =~ m:/(data|replay)-(.+):;
my $peer = $2;
my $replay = "$basedir/replay-$peer";
my $target = get_link($replay, 1);
next unless $target;
lprint "found replay link '$replay' -> '$target'\n";
# only the logfile number is relevant, not the positions
$target =~ s/,.*//;
$situation{$replay} = $target;
$start_logs{$target}++;
$start_count++;
_mark_path_transitive($basedir, $target, $peer);
}
if (!$start_count) {
lprint "Resource '$res' contains no valid information - there is nothing to purge\n";
return;
}
my %to_delete = ();
foreach my $file (lamport_glob("$basedir/version-*")) {
$file =~ m:/(version-([0-9]+)-([^,]+)): or ldie "bad path '$file'\n";
my $cand = $1;
my $nr = $2;
my $from = $3;
lprint "checking '$cand'\n";
my $vers = get_link($file, 1);
$vers =~ m/(log-[0-9]+-[^,:]+)/;
my $log = $1;
lprint " corresponding logfile is '$log'\n";
if (_is_visited($nr, $from)) {
lprint " ok '$cand'\n";
$logs{$log}++;
next;
}
if (!$force && $from ne $host) {
lprint " skipping foreign object '$cand'\n";
$logs{$log}++;
next;
}
lwarn "deleting foreign object from peer '$from' because you said --force\n" if $from ne $host;
$to_delete{$file}++;
}
foreach my $file (lamport_glob("$basedir/log-*")) {
$file =~ m:/(log-[0-9]+-(.*)): or ldie "bad path '$file'\n";
my $log = $1;
my $from = $2;
lprint "checking '$log'\n";
if ($logs{$log}) {
lprint " ok '$log'\n";
$logs{$log} = -1;
next;
}
if ($start_logs{$log}) {
lprint " ok start '$log'\n";
$logs{$log} = -1;
next;
}
if (!$force && is_link_recent($log)) {
lprint " skipping recent object '$log'\n";
next;
}
if (!$force && $from ne $host) {
lprint " skipping foreign object '$log'\n";
next;
}
lwarn "deleting foreign object from peer '$from' because you said --force\n" if $from ne $host;
$to_delete{$file}++;
}
# check for any races in the initial situation
my $nr_races = 0;
foreach my $replay (sort keys(%situation)) {
my $old_situation = $situation{$replay};
my $target = get_link($replay, 1);
# only the logfile number is relevant, not the positions
$target =~ s/,.*//;
if (!$target || $target ne $old_situation) {
lwarn "Race on '$replay' -> '$target' instead of '$old_situation'\n";
$nr_races++;
}
}
if ($nr_races) {
if ($max_retry-- > 0) {
lwarn "Restarting due to $nr_races races\n";
sleep(1);
goto retry;
}
ldie "Detected $nr_races, the situation is not stable\n";
}
# POINT OF NO RETURN
foreach my $file (sort keys(%to_delete)) {
_create_delete($file);
}
my $count = 0;
foreach my $log (sort alphanum_cmp keys(%logs)) {
my $nr = $logs{$log};
next if $nr < 0 || any_exists("$basedir/$log");
lprint_stderr "info: logfile '$log' is referenced ($nr), but not present.\n";
$count++;
}
if ($count) {
lprint_stderr " Unreferenced logfiles are not necessarily bad.\n";
lprint_stderr " They can regularly appear after 'leave-resource',\n";
lprint_stderr " or 'invalidate', or after emergency mode,\n";
lprint_stderr " or after similar operations.\n";
}
finish_links();
}
sub err_purge_res {
my ($cmd, $res) = @_;
foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}actual-$host/msg-err-*")) {
lprint " deleting '$leftlink'\n" if $verbose;
_create_delete($leftlink);
}
finish_links();
}
sub try_to_avoid_splitbrain {
my ($cmd, $res, $old_primary) = @_;
my $old_timeout = $timeout;
$old_primary = "" if $old_primary eq "(none)";
if (!detect_splitbrain($res, 0)) {
lwarn "ATTENTION: you are starting a non-forced primary switchover in a split brain situation.\n";
lwarn "ATTENTION: that's no good idea.\n";
lwarn "ATTENTION: I will continue to do what you want.\n";
lwarn "ATTENTION: But you are responsible for the consequences.\n";
return 0;
}
# now try to prevent producing a _new_ split brain situation....
my @host_list = lamport_glob("$mars/resource-$res/replay-*");
return 0 if scalar(@host_list) < 2;
my ($min, $max) = get_minmax_versions($res);
my $vers_glob = sprintf("$mars/resource-$res/version-%09d-*", $max);
my $ok = 1;
my $replay_err_path = "$mars/resource-$res/actual-$host/msg-err-replay-stop";
my $replay_err = get_link($replay_err_path, 1);
if ($replay_err && $replay_err ne "OK") {
my @stat = lstat($replay_err_path);
if ($stat[9] + $window >= mars_time()) {
my $msg = _get_text($replay_err);
lwarn "cannot guarantee split brain avoidance: $msg\n";
return 0;
}
}
my $emergency_path = "$mars/resource-$res/actual-$host/has-emergency";
my $emergency = get_link($emergency_path, 1);
if ($emergency) {
ldie "emergency mode $emergency has been entered locally: handover is not possible. Either free some space in $mars/, or use --force to use a potentially outdated version.\n";
}
my $primary = _get_designated_primary($res, -1);
if ($primary eq "(none)") {
# try to determine the old primary when unique
my $glob_logs = sprintf("$mars/resource-$res/log-%09d-*", $max);
my @candidates = lamport_glob($glob_logs);
if (scalar(@candidates) == 1) {
my $log_path = pop @candidates;
if ($log_path =~ m:/log-[0-9]+-(.+)$:) {
$primary = $1;
lprint "Using last primary '$primary' as a substitute.\n";
}
}
}
if ($primary && $primary eq $host) {
lprint "Switching back to last primary.\n";
return 0;
}
# if the old primary is known, we can ignore all other / unrelated hosts
if ($primary && $primary ne $host && $primary ne "(none)") {
my $path_p = "$mars/resource-$res/replay-$primary";
my $path_h = "$mars/resource-$res/replay-$host";
my $replay_p = get_link($path_p);
my $replay_h = get_link($path_h);
my $p_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $primary);
my $h_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $host);
my $p_vers = get_link($p_path, 1);
my $h_vers = get_link($h_path, 1);
if (!$p_vers || !$h_vers || $p_vers ne $h_vers) {
# Safeguard against artificial or missing version links
# (e.g. after crashes etc):
# when replay links are equal, we are safe.
my $stamp_p = get_link_stamp($path_p);
my $stamp_h = get_link_stamp($path_h);
my $p_stamp = get_link_stamp($p_path);
my $stable_stamp = mars_time() - $window / 4;
if ($replay_p eq $replay_h &&
$p_vers &&
$p_stamp < $stable_stamp &&
$stamp_p < $stable_stamp && $stamp_h < $stable_stamp) {
# self-healing the versionlink
lwarn "Correcting the versionlink '$h_path' from '$h_vers' to '$p_vers'\n";
set_link($p_vers, $h_path);
finish_links();
} else {
$ok = 0;
}
} elsif (!$replay_p || !$replay_h || $replay_p ne $replay_h) {
$ok = 0;
}
} else {
# old primary is unknown: we have no chance, other than comparing _all_ versions.
my @versions = lamport_glob($vers_glob);
my $first = get_link(shift @versions);
while (@versions) {
my $next = get_link(shift @versions);
if ($next ne $first) {
$ok = 0;
}
}
}
return 0 if $ok;
lprint "Trying to avoid split brain for $timeout s: logfile update not yet completed.\n";
my $tpl = get_macro("replinfo");
my $new_situation = eval_macro($cmd, $res, $tpl, @_);
print $new_situation;
# condition not met
return 1;
}
sub get_size {
my $arg = shift;
my $orig_arg = $arg;
# Do what I mean: when given a device, take its actual size.
if (-b $arg) {
my $mangled = $arg;
$mangled =~ s:/dev/::;
$mangled =~ s:/:!:g;
my $path = "/sys/block/$mangled/size";
$arg = `cat $path` * 512 if -r $path;
}
if ($arg !~ m/^([0-9]+(?:\.[0-9]*)?)([kmgtp]?)$/i) {
ldie "Size argument '$arg' must be a number, optionally followed by a suffix [kKmMgGtTpP]. Lowercase = multiples of 1000, Uppercase = multiples of 1024.\n";
}
my $mod = $2 || "";
$arg = $1;
$_ = $mod;
SWITCH: {
/^$/ and last SWITCH;
/^k$/ and $arg *= 1000, last SWITCH;
/^m$/ and $arg *= 1000 * 1000, last SWITCH;
/^g$/ and $arg *= 1000 * 1000 * 1000, last SWITCH;
/^t$/ and $arg *= 1000 * 1000 * 1000 * 1000, last SWITCH;
/^p$/ and $arg *= 1000 * 1000 * 1000 * 1000 * 1000, last SWITCH;
/^K$/ and $arg *= 1024, last SWITCH;
/^M$/ and $arg *= 1024 * 1024, last SWITCH;
/^G$/ and $arg *= 1024 * 1024 * 1024, last SWITCH;
/^T$/ and $arg *= 1024 * 1024 * 1024 * 1024, last SWITCH;
/^P$/ and $arg *= 1024 * 1024 * 1024 * 1024 * 1024, last SWITCH;
ldie "bad unit suffix '$mod'";
}
ldie "size argument '$orig_arg' evaluating to '$arg' is not a multiple of 4K = 4096\n" if ($arg % 4096) != 0;
return $arg;
}
# DEPRECATED
#
# TST NOTE: avoid calling this function. As such it is conceptually wrong,
# because during split-brain situations, there exists a _set_ of non-unique
# primaries. I want to remove this function, but I currently can't because
# other internal software at 1&1 is depending on it.
#
# Get actual primary node from links below actual-*/ subdirs
#
sub _get_actual_primary {
my ($res) = @_;
# TST: Presence of local device takes precedence over anything else.
# This tries to workaround the most important special case of
# split-brain situations, but cannot fix the problem exhaustively.
llog "DEPRECATED: you are trying to uniquely identify an actual primary hostname (as seen from host $host resource $res), but this is conceptually wrong because in split-brain situations there may exist multiple ones. Use view-is-primary instead. That would be safe.\n";
return $host if device_exists($res);
# The following old code is CONCEPTUALLY WRONG for split-brain situations (see NOTE above)
my @primary_links = lamport_glob("$mars/resource-$res/actual-*/is-primary");
my $primary;
foreach my $link (@primary_links) {
if (my $val = get_link($link)) {
$primary = ($link =~ qr%.*actual-([^/]+)/is-primary%)[0];
last;
# Note: if there are more than one 'is-primary' links (an insane state anyway),
# the first 'is-primary' link is selected. Other links are ignored.
}
}
return $primary;
}
my %old_primary;
sub _get_designated_primary {
my ($res, $unchecked) = @_;
my $fallback_to_old = 0;
if (defined($unchecked) && $unchecked == -1) {
$fallback_to_old = 1;
$unchecked = 1;
}
my $val = get_link("$mars/resource-$res/primary", $unchecked);
if (!defined($val) || !$val || $val eq "(none)") {
if ($fallback_to_old) {
$val = $old_primary{$res} if defined($old_primary{$res});
if (!defined($val) || !$val || $val eq "(none)") {
my $last_lnk = "$mars/resource-$res/userspace/last-primary";
$val = get_link($last_lnk, 2);
}
}
} else {
$old_primary{$res} = $val;
}
return $val;
}
sub is_actual_primary {
my ($cmd, $res, $peer) = @_;
$peer = $host unless (defined($peer) && $peer);
my $is_primary = get_link("$mars/resource-$res/actual-$peer/is-primary");
# notice: device presence _must not_ be used anymore.
return $is_primary;
}
sub __conv_tv {
my ($tv_sec, $tv_nsec) = @_;
if (defined($tv_nsec)) {
$tv_nsec = ".$tv_nsec";
} else {
$tv_nsec = "";
}
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(int($tv_sec));
return "$tv_sec$tv_nsec" unless defined($sec);
return sprintf("%04d-%02d-%02d %02d:%02d:%02d%s", $year+1900, $mon + 1, $mday, $hour, $min, $sec, $tv_nsec);
}
sub __conv_errno {
my ($txt, $error) = @_;
return "$txt$error" if !defined($error) || ($error <= 0);
$! = $error;
my $res = "${txt}[$!]";
$! = 0;
return $res;
}
sub _replace_timestamps {
my ($txt, $omit_nsec) = @_;
if (defined($omit_nsec) && $omit_nsec) {
$txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1):ge;
} else {
$txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1,$2):ge;
}
$txt =~ s:((error|status)\s*=?\s*-)([0-9]+):__conv_errno($1,$3):ge;
return $txt;
}
sub _get_text {
my ($glob, $regex, $do_print, $get_count) = @_;
my $text = "";
my $count = 0;
foreach my $path (lamport_glob($glob)) {
open(IN, "<", $path) or next;
while (my $line = <IN>) {
# use regex e.g. for fetching only errors and warnings
if (!$regex || $line =~ $regex) {
$line = _replace_timestamps($line);
$count++;
if ($do_print) {
print $line;
llog($line);
} else {
$text .= $line;
}
}
}
close(IN);
}
return $count if defined($get_count) && $get_count;
return $text;
}
my %global_error_texts;
sub get_error_text {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/logstatus-$host.status";
if (defined($global_error_texts{$path})) {
return $global_error_texts{$path};
}
my $text = _get_text($path, "m/^(err|warn)/i", 0);
$global_error_texts{$path} = $text;
return $text;
}
##################################################################
# helpers
sub _trigger {
my $code = shift || 1;
if ($dry_run) {
lprint "DRY_RUN: would execute trigger '$code'\n";
return;
}
lprint "Trigger '$code'\n" if $verbose;
system("(echo -n $code > /proc/sys/mars/trigger) >/dev/null 2>&1");
}
# Please do not misuse this.
# Use it only _exceptionally_ for creation of new memberships & co.
# Always prefer the PULL PRINCIPLE where possible.
sub _push_link {
my ($peer, $src, $dst) = @_;
if (!$peer || !$src || !$dst) {
lwarn "Bad push_link args '@_'\n";
return;
}
if ($peer eq $real_host) {
set_link($src, $dst);
return;
}
my $code = "p $peer $src $dst";
lprint "Pushing link '$dst' -> '$src' to peer '$peer'\n" if $verbose;
_trigger($code);
}
# DTO do not abuse
sub _push_link_foreign {
my ($peer, $peer_ip, $src, $dst) = @_;
if (!$peer || !$src || !$dst) {
lwarn "Bad push_link args '@_'\n";
return;
}
if ($peer eq $real_host) {
set_link($src, $dst);
return;
}
$peer_ip = _get_ip($peer) unless $peer_ip;
my $code = "P $peer $peer_ip $src $dst";
lprint "Pushing link '$dst' -> '$src' to peer '$peer' '$peer_ip'\n" if $verbose;
_trigger($code);
}
sub _push_check {
my ($peer, $peer_ip, $path) = @_;
return if $peer eq $real_host;
if (!$peer || !$path) {
lwarn "Bad push_check args '@_'\n";
return;
}
$peer_ip = _get_ip($peer) unless $peer_ip;
my $code = "c $peer $peer_ip $path";
lprint "Pushing check '$path' to peer '$peer' ip='$peer_ip'\n" if $verbose;
_trigger($code);
}
sub _push_fetch {
my ($peer, $peer_ip, $dir_path) = @_;
if (!$peer || !$dir_path) {
lwarn "Bad push_fetch args '@_'\n";
return;
}
$peer_ip = _get_ip($peer) unless $peer_ip;
my $code = "f $peer $peer_ip $dir_path";
lprint "Trigger fetch '$dir_path' from peer '$peer' ip='$peer_ip'\n" if $verbose;
lprint "$code\n" if $verbose > 1;
_trigger($code);
}
sub _switch {
my ($cmd, $res, $path, $on) = @_;
my $src = $on ? "1" : "0";
$path = correct_path($path);
my $old = get_link($path, 1);
if ($old && $old eq $src) {
lprint "${cmd} on resource $res is already activated\n" if $cmd;
return;
}
set_link($src, $path);
lprint "successfully started ${cmd} on resource $res\n" if $cmd;
}
sub _writable {
my ($path, $on) = @_;
my $oldmode = (lstat $path)[2] & 0700;
my $newmode = $on ? $oldmode | 0200 : $oldmode & ~0200;
lprint "chmod '$path' $oldmode $newmode";
chmod($newmode, $path) == 1 or ldie "cannot chmod '$path'\n";
}
sub _get_ip {
my ($peer) = @_;
$peer = $host unless $peer;
check_id($peer);
# Dynamic programming: this also applies to --ip-$peer=$peer_ip
return $known_ips{$peer} if $known_ips{$peer};
# Normally, everything should be in /mars/ips/ip-*
my $ip_path = "$mars/ips/ip-$peer";
if (my $from_link = get_link($ip_path, 2)) {
lprint_stderr "Using IP '$from_link' from '$ip_path'\n" if $verbose;
$known_ips{$peer} = $from_link;
return $from_link;
}
# Try any probe data
my $probe_path = "$mars/probe-$real_host/$mars/ips/ip-$peer";
if (my $probe_link = get_link($probe_path, 2)) {
lprint_stderr "Using PROBE IP '$probe_link' from '$probe_path'\n" if $verbose;
$known_ips{$peer} = $probe_link;
return $probe_link;
}
# Try the backups in reverse order
my $backup_glob = "$mars/backups-*/ips-backup/ip-$peer";
foreach my $backup (sort reverse_cmp glob($backup_glob)) {
my $check = get_link($backup, 2);
if ($check) {
lprint_stderr "Using BACKUP IP '$check' from '$backup'\n" if $verbose;
$known_ips{$peer} = $check;
return $check;
}
}
# Try /usr/bin/host
my $answer = `/usr/bin/host -t A $peer`;
if ($answer && $answer =~ m/([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/) {
my $addr = $1;
lprint_stderr "DNS query for '$peer' found IPv4 address '$addr'\n";
$known_ips{$peer} = $addr;
return $addr;
}
ldie "Cannot determine foreign IP for peer '$peer'\n" if $peer ne $real_host;
chomp (my @info = `/sbin/ip addr show`);
my $interface = "";
foreach my $line (@info) {
$interface = $1 if $line =~ m#^[0-9]+:\s([a-zA-Z_0-9]+):#;
next if $interface eq "lo";
if ($line =~ m#\sinet\s(\d+\.\d+\.\d+\.\d+)#) {
my $from_if = $1;
lprint_stderr "Using IP '$from_if' from interface '$interface'\n";
$known_ips{$peer} = $from_if;
return $from_if;
}
}
ldie "Cannot determine my own IP address ($real_host)\n";
}
sub _fake_versionlink {
my ($cmd, $res, $nr_links, $log_nr, $primary) = @_;
my $basedir = "$mars/resource-$res";
my $make_count = 0;
for (my $rounds = $nr_links; $rounds > 0 && $log_nr > 0; $rounds--) {
my $new_version = sprintf("$basedir/version-%09d-$host", $log_nr);
my $pri_version = sprintf("$basedir/version-%09d-$primary", $log_nr);
if ($primary eq $host) {
ldie "Cannot fake my own version link '$new_version'\n";
}
my $retry = 3;
for (;;) {
my $pri_link = get_link($pri_version, 1);
if ($pri_link) {
lprint "Creating new versionlink '$new_version' -> '$pri_link'\n";
set_link($pri_link, $new_version);
$make_count++;
last;
}
lwarn "Primary versionlink '$pri_version' does not exist\n";
return 0 if $retry-- < 0;
# During races with log-rotate, the appearace may be delayed
wait_cluster($cmd, $res, $primary, 0);
}
$log_nr--;
}
lwarn "Cannot create faked versionlink\n" if !$make_count;
return $make_count;
}
sub _set_replaylink {
my ($cmd, $res, $log_nr, $primary, $msg) = @_;
$msg = " -- THIS IS EXTREMELY RISKY -- any inconsistencies are on your own!" unless defined($msg);
ldie "no designated primary defined\n" unless ($primary && $primary ne "(none)");
my $basedir = "$mars/resource-$res";
my $rep_path = "$basedir/replay-$host";
my $rep_val = sprintf("log-%09d-$primary,0,0", $log_nr);
lprint "creating new replaylink '$rep_path' -> '$rep_val'\n";
set_link($rep_val, $rep_path);
_push_link($primary, $rep_val, $rep_path) if $primary ne $host;
my $ok = _fake_versionlink($cmd, $res, 2, $log_nr, $primary);
set_link("$log_nr$msg", "$basedir/skip-check-$host");
return $ok;
}
##################################################################
# lowlevel tools
sub lowlevel_ls_host_ips {
my ($cmd) = @_;
for my $path (lamport_glob("$mars/ips/ip-*")) {
$path =~ m:/ip-(.*):;
my $peer = $1;
my $ip = get_link($path, 1);
lprint "$peer $ip\n";
}
}
sub lowlevel_set_host_ip {
my ($cmd, $peer, $ip) = @_;
check_id($peer);
if (!$ip) {
$ip = _get_ip($peer);
}
my $path = "$mars/ips/ip-$peer";
my $old = get_link($path, 2) || "";
lprint "Set host '$peer' IP from '$old' to '$ip'\n";
set_link($ip, $path);
foreach my $peer2 (get_total_peers()) {
next if $peer2 eq $peer;
_push_link($peer2, $ip, $path);
}
}
sub lowlevel_delete_host {
my ($cmd, $peer) = @_;
check_id($peer, 1);
my $path = "$mars/ips/ip-$peer";
my $old = get_link($path);
lprint "Removing host '$peer' old IP '$old'\n";
if (!$compat_deletions) {
foreach my $peer2 (get_total_peers()) {
next if $peer2 eq $peer;
_push_link($peer2, ".deleted", $path);
}
}
_create_delete($path);
}
##################################################################
# commands
sub ignore_cmd {
my ($cmd, $res) = @_;
lprint "ignoring command '$cmd' on resource '$res'\n";
}
sub senseless_cmd {
my ($cmd, $res) = @_;
lprint "command '$cmd' makes no sense with MARS (ignoring on resource '$res')\n";
}
sub forbidden_cmd {
my ($cmd, $res) = @_;
ldie "command '$cmd' on resource '$res' cannot be used with MARS (migth affect too many hosts, lead to undesired consequences)\n";
}
sub nyi_cmd {
my ($cmd, $res) = @_;
ldie "command '$cmd' on resource '$res' is not yet implemented\n";
}
sub is_module_loaded {
return -d "/proc/sys/mars";
}
sub set_connect_pref_list {
my ($cmd, $res, $list) = @_;
check_res_member($cmd, $res);
my $dst = "$mars/resource-$res/connect-$host";
if ($cmd =~ m/^get-/) {
my $value = get_link($dst);
lprint "$value\n";
return;
}
check_id_list($list, 1);
set_link($list, $dst);
}
sub set_defaults {
my ($cmd, $res, $value) = @_;
my $key = $cmd;
$key =~ s/set-(global-)?//;
my $path = "$mars/defaults/$key";
if ($res) {
check_res_member($cmd, $res) if $res;
$path = "$mars/resource-$res/defaults/$key";
}
if ($value) {
my $flags = txt2featuresflags($cmd, $value);
lprint "Set '$path' to '$flags'\n";
set_link($flags, $path);
} else {
lprint "Delete '$path'\n";
_create_delete($path);
}
}
sub emergency_limit_res {
my ($cmd, $res, $value) = @_;
my $dst = "$mars/resource-$res/todo-$host/emergency-limit";
if ($cmd =~ m/^get-/) {
my $value = get_link($dst);
lprint "$value\n";
return;
}
ldie "percent argument '$value' isn't numeric\n" unless $value =~ m/^[0-9]+$/;
ldie "percent argument '$value' isn't between 0 and 100%\n" unless ($value >= 0 && $value <= 100);
set_link($value, $dst);
}
sub set_link_cmd {
my $cmd = shift;
for (;;) {
my $src = shift || last;
if ($cmd =~ m/^get-/) {
my $value = get_link($src);
lprint "$value\n";
next;
}
my $dst = shift || ldie "you did not supply a symlink destination for source '$src'\n";
ldie "symlink target '$dst' is not an absolute path\n" unless $dst =~ m:^/:;
check_userspace($dst);
my $dir = `dirname "$dst"` or ldie "path '$dst' has no dirname\n";
chomp $dir;
ldie "directory '$dir' does not exist\n" unless -d $dir;
set_link($src, $dst);
}
}
sub set_sync_limit_value {
my ($cmd, $value) = @_;
my $todo_dir = "$mars/defaults-$host";
ldie "directory '$todo_dir' does not exist\n" unless -d $todo_dir;
my $dst = "$todo_dir/sync-limit";
if ($cmd =~ m/^get-/) {
my $value = get_link($dst);
lprint "$value\n";
return;
}
set_link($value, $dst);
}
sub create_uuid {
my ($cmd) = @_;
my $old_uuid = get_link("$mars/uuid", 2);
ldie "Cluster was already created with uuid='$old_uuid'. " .
"For safety reasons, no override is possible at marsadm level.\n" if $old_uuid;
my $uuid = `echo -n \$(hostname) \$(date)`;
set_link($uuid, "$mars/uuid");
finish_links(); # opportunity for errors => don't continue
lprint "New cluster UUID is '$uuid'\n";
}
sub _create_dirs {
my ($cmd) = @_;
system("mkdir $mars/ips") unless -d "$mars/ips";
system("mkdir $mars/userspace") unless -d "$mars/userspace";
system("mkdir $mars/defaults") unless -d "$mars/defaults";
system("mkdir $mars/defaults-$host") unless -d "$mars/defaults-$host";
system("mkdir $mars/todo-global") unless -d "$mars/todo-global";
mkdir("$mars/actual-$host") unless -d "$mars/actual-$host";
set_link($marsadm_version, "$mars/actual-$host/marsadm-version");
}
sub _create_cluster {
my ($cmd) = @_;
ldie "The $mars directory does not exist.\n" unless -d $mars;
my $ip = _get_ip($host);
_create_dirs($cmd);
create_uuid(@_) if $cmd eq "create-cluster";
set_link("0", "$mars/defaults-$host/sync-limit");
set_link("(none)", "$mars/defaults-$host/sync-pref-list");
set_link($ip, "$mars/ips/ip-$host");
set_link("1", "$mars/todo-global/deleted-$host");
}
sub create_cluster {
my ($cmd, $peer) = @_;
ldie "cluster is already created\n" if !$force && -d "$mars/ips";
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
_create_cluster(@_);
}
sub join_cluster {
my ($cmd, $peer, $peer_ip) = @_;
ldie "Cannot join myself (peer='$peer', host='$host')\n" if $peer eq $host;
ldie "Directory $mars is missing\n" unless -d $mars;
if (lamport_glob("$mars/resource-*")) {
lwarn "DANGER: some resources already exist!\n";
ldie "DANGER: If you are sure that no resource clash is possible, re-invoke $cmd with '--force' option\n" unless $force;
}
_create_dirs($cmd);
$peer_ip = _get_ip($peer) unless $peer_ip;
# try new join method
if (is_module_loaded()) {
my $ip = _get_ip($host);
lprint "MARS kernel module is loaded, trying the new $cmd method.\n";
my $old_uuid = get_link("$mars/uuid", 2);
if (!$old_uuid || $old_uuid eq "(any)") {
mkdir("$mars/ips") unless -d "$mars/ips";
mkdir("$mars/todo-global") unless -d "$mars/todo-global";
mkdir("$mars/defaults-$host") unless -d "$mars/defaults-$host";
mkdir("$mars/actual-$host") unless -d "$mars/actual-$host";
set_link("(any)", "$mars/uuid") unless $old_uuid;
set_link($ip, "$mars/ips/ip-$host");
my $peer_known = link_exists("$mars/ips/ip-$peer");
if (!$peer_known) {
set_link($peer_ip, "$mars/ips/ip-$peer");
}
# make PRELIMINARY links, timestamp is 1 second after the Big Bang (1970)
finish_links(1);
_trigger();
lprint "Update local '$real_host' information\n";
update_cluster($cmd, "all", $real_host);
if ($peer_known) {
lprint "Update peer '$peer' information\n";
update_cluster($cmd, "all", $peer);
}
# write again, this time with current mars_time()
set_link($ip, "$mars/ips/ip-$host");
set_link("0", "$mars/defaults-$host/sync-limit");
set_link("(none)", "$mars/defaults-$host/sync-pref-list");
my $deleted_path = "$mars/todo-global/deleted-$host";
set_link("1", $deleted_path) unless link_exists($deleted_path);
finish_links();
_push_link($peer, $ip, "$mars/ips/ip-$host");
lprint "Checking uuid\n";
my $round = 0;
my $new_uuid = get_link("$mars/uuid", 1);
while ($round++ < 5) {
_trigger(3);
sleep(3);
lprint "... update from $peer round $round\n";
_reset_resources();
update_cluster($cmd, "all", $peer);
$new_uuid = get_link("$mars/uuid");
last if ($new_uuid && $new_uuid ne "(any)");
_push_link($peer, $ip, "$mars/ips/ip-$host");
}
if ($new_uuid && $new_uuid ne "(any)") {
lprint "Successfully joined cluster, uuid='$new_uuid'\n";
return;
} else {
lwarn "Unable to determine the remote cluster uuid.\n";
}
} else {
lwarn "Cluster uuid '$old_uuid' already present, cannot use the new $cmd method\n";
}
}
lprint "Falling back to the old ssh/rsync based $cmd method (peer='$peer' peer_ip='$peer_ip')\n";
ldie "OLD method: MARS module is loaded, please unload first before using ssh\n" if is_module_loaded();
my $ip = _get_ip($host);
rsync_cmd($peer, "--max-size=1 --update $peer:$mars/ $mars/");
# check uniqness of IPs
my %ips = ();
foreach my $other_ip_path (lamport_glob("$mars/ips/*")) {
my $other_ip = get_link($other_ip_path, 1);
my $other_host = $other_ip_path;
$other_host =~ s:.*/ip-::;
lwarn "Host '$other_host' IP '$other_ip' is AMBIGUOUS\n" if defined($ips{$other_host});
$ips{$other_host} = $other_ip;
lwarn "New IP '$ip' already exists for host '$host' at '$other_ip_path'\n" if ($ip eq $other_ip && $other_host eq $host);
}
_create_cluster(@_);
finish_links();
rsync_cmd($peer, "--update $mars/ips/ $peer:$mars/ips/");
}
sub _get_probe {
my ($probe_dir, $peer, $peer_ip) = @_;
$peer_ip = _get_ip($peer) unless $peer_ip;
mkdir($probe_dir);
my $new_mars = "$probe_dir/$mars";
_push_fetch($peer, $peer_ip, $probe_dir);
return $new_mars;
}
# allow accumulation data from multiple peers
sub _wait_probe {
my ($probe_dir, $peer, $peer_ip) = @_;
my $got_path = "$probe_dir/got-$peer";
lprint "Wait for probe data from '$peer' in '$got_path'\n";
while (!link_exists($got_path)) {
lprint "Wait for '$got_path' to appear\n";
sleep_timeout(5);
last if link_exists($got_path);
_push_fetch($peer, $peer_ip, $probe_dir);
}
lprint "Got probe data from '$peer' in '$got_path'\n";
}
sub merge_cluster {
my ($cmd, $peer, $peer_ip) = @_;
my $uuid = readlink("$mars/uuid");
my @resources = lamport_glob("$mars/resource-*");
my @ip_links = lamport_glob("$mars/ips/*");
if ($cmd =~ m/-list/) {
print "UUID: $uuid\n";
print "IPs:\n";
foreach my $ip (@ip_links) {
print "$ip\n";
}
print "RESOURCEs:\n";
foreach my $i (@resources) {
print "$i\n";
}
return;
}
ldie "No peer argument given" unless $peer;
ldie "Cannot merge myself (peer='$peer', host='$host')\n" if $peer eq $host;
ldie "Directory $mars is missing\n" unless -d $mars;
ldie "A cluster UUD '$mars/uuid' does not exist. Please use 'join-cluster instead.\n" unless link_exists("$mars/uuid");
# start_deprecated_CODE
if (!ssh_cmd($peer, $ssh_probe, 1)) {
lprint "Falling back to OLD ssh-based $cmd\n";
merge_cluster_old(@_);
return;
}
lprint "Using NEW $cmd without ssh\n";
# end_deprecated_code
if ($peer_ip) {
lprint "Given on command line: peer '$peer' has IP '$peer_ip'\n";
} else {
$peer_ip = _get_ip($peer);
ldie "Cannot determine IP for peer '$peer'. Please give the IP on the command lime.\n" unless $peer_ip;
lprint "Determined: peer '$peer' appears to have IP '$peer_ip'\n";
}
my $peer_ip_path = "$mars/ips/ip-$peer";
my $probe_ip = get_link($peer_ip_path, 1);
unless ($probe_ip) {
finish_links();
lprint "Need to set preliminary peer IP\n";
set_link($peer_ip, $peer_ip_path);
# Make PRELIMINARY links, timestamp is 1 second after the Big Bang (1970)
finish_links(1);
_trigger(3);
}
my $probe_dir = "$mars/probe-$real_host";
system("rm -rf $probe_dir");
my $new_mars = _get_probe($probe_dir, $peer, $peer_ip);
_wait_probe($probe_dir, $peer, $peer_ip);
my %ips;
my %old_peers;
my %merge_peers;
foreach my $old_peer (get_total_peers()) {
$ips{$old_peer} = _get_ip($old_peer);
$old_peers{$old_peer} = 1;
$merge_peers{$old_peer} = 1;
lprint " old peer '$old_peer' ip='$ips{$old_peer}'\n";
}
foreach my $ip_path (lamport_glob("$new_mars/ips/ip-*")) {
my $ip = get_link($ip_path, 1);
next unless $ip;
my $new_peer = $ip_path;
$new_peer =~ s:^.+/ip-::;
next if $merge_peers{$new_peer};
$ips{$new_peer} = _get_ip($new_peer);
$merge_peers{$new_peer} = 1;
lprint " new peer '$new_peer' ip='$ips{$new_peer}'\n";
}
my %new_res;
foreach my $ip (lamport_glob("$new_mars/resource-*")) {
my $new_res = $ip;
$new_res =~ s:^.+/resource-::;
$new_res{$new_res} = 1;
}
my $new_uuid = readlink("$new_mars/uuid");
ldie "Other cluster peer '$peer' has no UUID\n" unless $new_uuid;
my %old_res;
foreach my $str (get_total_resources()) {
$old_res{$str} = 1;
}
if ($new_uuid eq $uuid) {
lprint "Other cluster peer '$peer' has the same UUID.\n";
lprint "No resource name checking necessary.\n";
lprint "Operation '$cmd' will work logically idempotent.\n";
} else {
# Check that both sets of resources are logically disjoint
lprint "Other cluster peer '$peer' has a different UUID, checking for resource name conflicts.\n";
my %inter_res = key_intersect(\%old_res, \%new_res);
my $conflicts = 0;
foreach my $res (keys(%inter_res)) {
lwarn "common resource '$res'\n";
$conflicts++;
}
ldie "cannot merge cluster '$peer' due to $conflicts conflicts\n" if $conflicts;
}
lprint "MERGE '$peer'\n";
foreach my $new_peer1 (keys(%merge_peers)) {
lprint "fetch '$new_peer1'\n";
_push_fetch($new_peer1, $ips{$new_peer1}, "/");
foreach my $new_peer2 (keys(%merge_peers)) {
lprint " push '$new_peer2'\n";
my $ip_path = "$mars/ips/ip-$new_peer2";
_push_link_foreign($new_peer1, $ips{$new_peer1}, $ips{$new_peer2}, $ip_path);
}
}
system("rm -rf $probe_dir");
}
# deprecated, to disappear
sub merge_cluster_old {
my ($cmd, $peer) = @_;
ldie "ssh is disabled\n" unless $ssh_port;
my $uuid = readlink("$mars/uuid");
my @resources = lamport_glob("$mars/resource-*");
my @ip_links = lamport_glob("$mars/ips/*");
# check connections
my $check_cmd = "uname -a";
system("$check_cmd") == 0 or ldie "oops, 'uname is not installed'\n";
system("rsync --help > /dev/null") == 0 or ldie "Command 'rsync' is not installed\n";
ssh_cmd($peer, $ssh_probe);
my @old_peers;
foreach my $ip (@ip_links) {
my $old_peer = $ip;
$old_peer =~ s:^.+/ip-::;
next if $old_peer eq $host;
next if $old_peer eq $real_host;
ssh_cmd($old_peer, $ssh_probe);
push @old_peers, $old_peer;
}
# check whether merge-cluster is possible
my %total_res;
foreach my $res (@resources) {
$total_res{$res}++;
}
my $ssh_cmd = make_ssh_cmd($peer) . " marsadm merge-cluster-list";
my $answer = `$ssh_cmd`;
$answer =~ m/^UUID: (.*)$/m or ldie "cannot determine remote UUID from '$answer'\n";
my $other_uuid = $1;
ldie "Other cluster peer '$peer' has no UUID\n" unless $other_uuid;
if ($other_uuid eq $uuid) {
lprint "Other cluster peer '$peer' has the same UUID.\n";
lprint "No resource name checking necessary.\n";
lprint "Operation '$cmd' will therfore work logically idempotent.\n";
} else {
if (link_exists("$mars/tree-$peer")) {
lwarn "A valid tree signature '$mars/tree-$peer' already exists, thus it appears to be already merged!\n";
ldie "Aborting for saftey. Override via --force only if you know what you are doing!\n" unless $force;
}
# Check that both sets of resources are disjoint
lprint "Other cluster peer '$peer' has a different UUID, checking for resource name conflicts.\n";
my @other_resources;
my @conflicts;
my $copy = $answer;
$copy =~ s/\A.*?RESOURCEs:\n//ms;
while ($copy) {
$copy =~ s/\A(.*)\n$//m;
my $other_res = $1;
last unless $other_res;
push @other_resources, $other_res;
if ($total_res{$other_res}++) {
push @conflicts, $other_res;
}
}
if (@conflicts) {
lprint "CONFLICTS:\n";
foreach my $res (@conflicts) {
lprint "\t$res\n";
}
ldie "Cannot $cmd: some resource directories exist at both clusters with same name.\nThis cannot be overriden.\nPlease resolve the conflict by hand.\n";
}
lprint "List of total resources:\n";
foreach my $res (keys(%total_res)) {
lprint "\t$res\n";
}
# INTERNAL, for debugging and error analysis: backup the old uuid symlink
my $backup = "$backup_dir/uuid-backups";
system("mkdir -p $backup; cp -a $mars/uuid $backup/") unless link_exists("$backup/uuid");
}
# Start the "hot phase"
my $rsync_cmd = "--max-size=1";
rsync_cmd($peer, "$rsync_cmd $peer:$mars/uuid $mars/uuid");
foreach my $old_peer (@old_peers) {
rsync_cmd($old_peer, "$rsync_cmd $mars/uuid $old_peer:$mars/uuid");
}
$rsync_cmd .= " --update --ignore-existing";
rsync_cmd($peer, "$rsync_cmd $peer:$mars/ $mars/");
rsync_cmd($peer, "$rsync_cmd $mars/ $peer:$mars/");
}
sub leave_cluster {
my ($cmd) = @_;
ldie "mars kernel module is not loaded. This is needed for communication with some other hosts!\n" if !is_module_loaded();
my $check = "/mars/resource-*/data-$host";
ldie "I am member of some resources\n" if lamport_glob($check) && !$force;
wait_cluster($cmd) unless $force;
foreach my $path (lamport_glob("$mars/actual-*/*-$host")) {
_create_delete($path);
}
_create_delete("$mars/ips/ip-$host");
finish_links();
wait_cluster($cmd) unless $force;
foreach my $path (lamport_glob("$mars/actual-*/*-$host")) {
_create_delete($path);
}
_create_delete("$mars/todo-global/deleted-$host");
finish_links();
wait_cluster($cmd) unless $force;
while (-f "$mars/ips/ip-$host" && $timeout_val >= 0) {
sleep_timeout(3, 1);
}
system("rmmod mars") if $host eq $real_host;
foreach my $path (lamport_glob("$mars/{,resource-*/}{todo,actual}-*/*-$host")) {
unlink($path);
}
foreach my $path (lamport_glob("$mars/{,resource-*/}{todo,actual}-$host")) {
rmdir($path); # at least try it
}
}
sub create_res {
my ($cmd, $res, $dev, $appear, $size_arg) = @_;
my $create = ($cmd eq "create-resource");
ldie "undefined device or size argument\n" unless $dev;
$appear = $res if !$appear;
check_id($appear) if $appear;
my $resdir = "$mars/resource-$res";
if ($create) {
if (-d $resdir) {
lwarn "resource directory '$res' already exists\n";
my @host_list = lamport_glob("$resdir/replay-*");
if (@host_list) {
my $h_list = join(',', map({ $_ =~ s:.*/replay-::; $_; } (@host_list)));
lwarn "DANGER: hosts '$h_list' are already member of resource '$res'.\n";
ldie "REFUSING to trash your resource!\n" unless $force;
}
}
lprint "creating new resource '$res'\n";
} else {
if (link_exists("$resdir/data-$host")) {
lwarn "resource '$res' has been already joined -- this is dangerous!\n";
ldie "refusing dangerous operation\n" unless $force;
} else {
lprint "joining to existing resource '$res'\n";
}
}
my $size = 0;
if (-b $dev) {
ldie "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//;
use Fcntl 'SEEK_END', 'O_RDONLY', 'O_RDWR', 'O_EXCL';
my $flags = O_RDWR | O_EXCL;
if ($force) {
$flags = O_RDONLY;
}
sysopen(TEST, $dev, $flags) or ldie "cannot open device '$dev' for exclusive rw access\n";
$size = sysseek(TEST, 0, SEEK_END);
close(TEST);
lprint "block device '$dev': determined size = $size bytes\n";
if ($size_arg) {
my $new_size = get_size($size_arg);
ldie "size argument '$size_arg' is smaller than device size '$size'\n" unless $new_size <= $size;
lprint "reducing size from $size to $new_size\n";
$size = $new_size;
}
} elsif (-f $dev) {
lprint "Using regular (sparse) file '$dev' as an image file\n";
$size = get_size($dev);
if ($size > 0) {
$dev = "";
} else {
ldie "bad parameter '$dev': must be either a block device name, or size followed by k or m or g or t\n";
}
} else {
ldie "Device argument '$dev' does not exist or is no usable block device.\n"
}
ldie "implausible size $size" unless $size > 4096 * 16; # smaller floppies should not exist ;)
# check for uniqeness of $appear
if ($appear) {
foreach my $old_dev (lamport_glob("$mars/resource-*/device-$host")) {
$old_dev =~ m:/resource-([^/]+)/:;
my $old_res = $1;
next if (!$old_res || $old_res eq $res || $old_res eq "(none)");
my $old_name = get_link($old_dev);
if ($old_name eq $appear) {
if (link_exists("$mars/resource-$old_res/data-$host")) {
ldie "Device name '/dev/mars/$old_name' already exists at resource '$old_res'\n";
} else {
lwarn "Device name '/dev/mars/$old_name' already exists in another resource '$old_res'.\n";
lwarn "This does no harm, but may be confusing.\n";
lwarn "Please name your devices equal to the resource names by convention.\n";
}
}
}
# warn if devices are named differently throughout the cluster
foreach my $old_dev (lamport_glob("$resdir/device-*")) {
my $old_name = get_link($old_dev);
next unless $old_name;
next if $old_name eq "(none)";
if ($old_name ne $appear) {
$old_dev =~ m:/device-(.+)$:;
my $old_host = $1;
next unless $old_host;
lwarn "your name '/dev/mars/$appear' differs from '/dev/mars/$old_name' on host '$old_host'.\n";
lwarn "this does no harm, but may be confusing.\n";
}
}
}
err_purge_res($cmd, $res);
if (!$create) {
_activate_resource($cmd, $res, $host);
}
my $max_primary_retry = 5;
primary_retry:
my $primary;
my $replay_nr = -1;
my $use_rsync = 0;
if ($create) {
mkdir($resdir);
ldie "could not create resource '$res'\n" unless -d $resdir;
set_link($size, "$resdir/size");
} else { # join
my $ip = _get_ip($host);
# For safety, try to get the very newest infos.
# Reason: newer kernel modules will fetch non-member resource infos less frequently.
# Therefore we shift some responsibility for non-member -> member transitions to userspace.
$primary = _get_designated_primary($res, 1);
for (my $retry = 0; $retry < 3; $retry++) {
last if (defined($primary) && $primary);
lprint "Trying to get primary info\n";
my $condition = "return _get_designated_primary(\"$res\", 1);";
$primary = update_cluster($cmd, $res, "*", $condition);
}
my $max_retry = 5;
for (;;) {
$primary = _get_designated_primary($res, 1);
last if $max_retry-- < 0;
if ($primary && $primary ne $host && $primary ne "(none)") {
# first check whether symlink information appears to be recent enough
last if recent_cluster($cmd, $res, $primary);
# for safety, fetch newest infos from last known primary
if ($max_retry >= 3) {
lprint "Trying update-cluster from '$primary'\n";
update_cluster($cmd, $res, $primary);
next;
} elsif ($max_retry >= 1) {
lprint "Trying full update-cluster\n";
update_cluster($cmd, $res);
next;
}
$use_rsync++;
rsync_cmd($primary, "--max-size=1 --update $primary:$mars/resource-$res/ $mars/resource-$res/", $res, 1);
next;
}
lprint "Using update-cluster for primary determination\n";
update_cluster($cmd, $res);
next if $max_retry > 2;
my @peers = map { m:^$mars/ips/ip-(.+):; $1 } lamport_glob("$mars/ips/ip-*");
if (!@peers || $max_retry < 0) {
lwarn "cannot get a relevant primary\n";
last;
}
# We are desperate. Try to fetch the directory from anywhere.
my $peer = shift @peers;
next if $peer eq $host;
next if $peer eq $real_host;
$use_rsync++;
rsync_cmd($peer, "--max-size=1 --update $peer:$mars/resource-$res $mars/", 1);
} # retry loop
ldie "resource '$res' does not exist\n" unless -d $resdir;
ldie "resource '$res' has no designated primary\n" unless $primary && $primary ne "(none)";
finish_links();
wait_cluster($cmd, $res);
my $res_size = get_link("$mars/resource-$res/size", 1);
if ($size < $res_size) {
lwarn "size of new device is only $size, but should be $res_size\n";
ldie "refusing to join due to bad size\n" unless $force;
} elsif ($size > $res_size) {
lprint "Your physical device has size $size, which is larger than the logical resource size $res_size.\n";
lprint "This does no harm, but you are wasting some space.\n";
}
$primary = _get_designated_primary($res);
ldie "Sorry, joining is only possible if a designated primary exists.\n" if $primary eq "(none)";
ldie "implausible state: I ($host) am already designated primary of resource '$res' which I just wanted to join\n" if $primary eq $host;
ldie "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless link_exists("$mars/ips/ip-$host");
check_sync_startable(@_);
my $replay = get_link("$resdir/replay-$primary");
if ($replay =~ m/^log-([0-9]+)-/) {
$replay_nr = $1;
$replay_nr =~ s/^0+//;
} else {
lwarn "'$res' cannot determine current primary '$primary' logfile number from '$replay'.\n";
if ($max_primary_retry-- >= 0) {
wait_cluster($cmd, $res);
goto primary_retry;
}
ldie "Primary '$primary' info is missing for '$res'.\n";
}
}
# purge before complaining
log_purge_res($cmd, $res);
err_purge_res($cmd, $res);
finish_links();
_wait_delete();
# check for remains from former incidents
my $max_retry = 3;
while ($max_retry-- > 0) {
my @remains = ();
foreach my $remain (lamport_glob("$resdir/{log,version}-*-$host")) {
if (!$create) {
$remain =~ m/-([0-9]+)-$host/;
my $nr = $1;
next if $nr < $replay_nr;
}
lwarn "OLD REMAIN '$remain'\n";
push @remains, $remain;
}
last if !@remains;
lwarn "Resource directory has some old remains.\n";
if ($force && !$create) {
foreach my $remain (@remains) {
lwarn "REMOVING remain '$remain'";
_create_delete($remain);
}
finish_links();
_wait_delete();
next;
}
lwarn "First cleanup.\n";
my $extra_op = $create ? "delete-resource" : "log-purge-all";
lwarn "If you really want to $cmd, run 'marsadm $extra_op' first.\n";
ldie "REFUSING $cmd\n" unless $force;
}
my $file = "$resdir/data-$host";
if (!$dev) {
lwarn "link '$file' already exists - reusing\n" if link_exists($file);
lprint "setup sparse file '$file' with size $size\n";
open(OUT, ">>", $file) or ldie "could not open '$file'\n";
truncate(OUT, $size) or ldie "truncate to size $size failed\n";
close OUT;
} else {
lprint "using existing device '$dev'\n";
set_link($dev, $file);
_push_link($primary, $dev, $file) if $primary;
}
my $max_join_retry = 5;
join_retry:
if ($appear) {
lprint "resource '$res' will appear as local device '/dev/mars/$appear'\n";
my $device_path = "$resdir/device-$host";
set_link($appear, $device_path);
_push_link($primary, $appear, $device_path) if $primary;
}
mkdir("$mars/userspace") unless -d "$mars/userspace";
mkdir("$resdir/userspace") unless -d "$resdir/userspace";
mkdir("$resdir/defaults") unless -d "$resdir/defaults";
mkdir("$resdir/defaults-$host");
mkdir("$resdir/local-$host");
mkdir("$resdir/actual-$host");
set_link("0", "$resdir/actual-$host/if-on");
my $todo = "$resdir/todo-$host";
mkdir($todo);
set_link("0", "$todo/attach");
set_link("1", "$todo/connect");
set_link("1", "$todo/sync");
set_link("1", "$todo/allow-replay");
unlink("$resdir/syncstatus-$host");
my $replay_ok = 1;
if ($create) {
set_link($host, "$resdir/primary");
set_link($host, "$resdir/userspace/last-primary");
set_link($size, "$resdir/syncstatus-$host");
my $startnr = get_link("$resdir/maxnr", 2);
if (defined($startnr) && $startnr ne "" && $startnr > 0) {
$startnr += 100000;
} else {
$startnr = 1;
}
my $fmt_old = sprintf("%09d", $startnr - 1);
my $fmt = sprintf("%09d", $startnr);
set_link("log-$fmt-$host,0,0", "$resdir/replay-$host");
my $logfile = "$resdir/log-$fmt-$host";
safe_creat($logfile) unless $dry_run;
my $old_fake = "00000000000000000000000000000000,log-$fmt_old-$host,0";
set_link("$old_fake:", "$resdir/version-$fmt_old-$host");
set_link("00000000000000000000000000000000,log-$fmt-$host,0:$old_fake", "$resdir/version-$fmt-$host");
set_link("$startnr", "$resdir/skip-check-$host") if $startnr > 1;
set_link("$startnr", "$resdir/maxnr");
finish_links();
lprint "successfully created resource '$res'\n";
} else { # join
$replay_ok = _set_replaylink($cmd, $res, $replay_nr, $primary, "");
set_link("0", "$resdir/syncstatus-$host");
finish_links();
_trigger(3);
if ($use_rsync && $ssh_port) {
rsync_cmd($primary, "--max-size=1 --update $file $primary:$mars/resource-$res/", 1);
}
wait_cluster($cmd, $res, $primary);
}
set_link("1", "$todo/attach");
finish_links();
_trigger(3);
_reset_resources();
if (!$create) {
# Check for races with logrotate
for (my $check_redo = 3; $check_redo > 0; $check_redo--) {
wait_cluster($cmd, $res, $primary, 0);
# Split brain or orphanness can happen due to _inherent_ races
# with fresh appearance of the new replaylink.
lprint "Checking '$res' for split brain\n";
%detected_splits = ();
my $split_ok = detect_splitbrain($res, 1);
lprint "redo=$check_redo resource '$res' split_ok=$split_ok replay_ok=$replay_ok\n";
if (!$split_ok || !$replay_ok) {
lwarn "Race '$res' with log-rotate detected.\n";
if ($max_join_retry-- >= 0) {
wait_cluster($cmd, $res, "*", 0);
invalidate_res_phase1($cmd, $res);
wait_cluster($cmd, $res, $host, 0);
invalidate_res_phase2($cmd, $res);
lwarn "Restarting '$res' due to unstable situation at primary '$primary'\n";
goto join_retry;
}
}
}
lprint "Successfully joined resource '$res' to primary '$primary'\n";
}
_systemd_trigger($cmd);
}
sub _fetch_transitive_peers {
lprint "Computing the transitive closure of peers:\n";
my @peers = get_total_peers();
my @old_peers;
my $probe_dir = "$mars/probe-$real_host";
system("rm -rf $probe_dir");
while (scalar(@peers) > scalar(@old_peers)) {
lprint "Current #peers: " . scalar(@peers) . "\n";
@old_peers = @peers;
foreach my $peer (@peers) {
next if $peer eq $real_host;
_get_probe($probe_dir, $peer);
}
foreach my $peer (@peers) {
next if $peer eq $real_host;
_wait_probe($probe_dir, $peer);
}
_reset_resources();
@peers = get_total_peers();
}
lprint "Final #peers: " . scalar(@peers) . "\n";
}
sub split_cluster {
my ($cmd) = @_;
# start_deprecated_CODE
foreach my $peer (get_total_peers()) {
next unless $ssh_port;
next if $peer eq $real_host;
if (!ssh_cmd($peer, $ssh_probe, 1)) {
lprint "Falling back to OLD ssh-based $cmd\n";
split_cluster_old(@_);
return;
}
}
lprint "Using NEW $cmd without ssh\n";
# end_deprecated_code
my $probe_dir = _fetch_transitive_peers();
system("rm -rf $probe_dir");
lprint "IPS:\n";
my @peers = get_total_peers();
foreach my $peer (@peers) {
my $ip = _get_ip($peer);
lprint " Peer='$peer' IP='$ip'\n";
}
lprint "PEER GROUPS:\n";
my $nr = 0;
my %touched;
my %peer_groups;
foreach my $peer1 (@peers) {
next if ($touched{$peer1});
my %members;
$members{$peer1} = 1;
foreach my $res (get_total_resources($peer1)) {
lprint " peer '$peer1' res '$res'\n" if $verbose > 2;
my @peers2 = get_total_peers($res);
foreach my $peer2 (@peers2) {
next if $members{$peer2};
lprint " peer '$res' res '$res' '$peer2'\n" if $verbose > 2;
$touched{$peer2} = 1;
$members{$peer2} = 1;
}
}
$peer_groups{$peer1} = \%members;
$nr++;
lprint " #$nr: $peer1 => " . join(",", sort alphanum_cmp keys(%members)) . "\n";
}
if ($nr <= 1) {
lprint "Nothing to do: there are only $nr group(s)\n";
system("rm -rf $probe_dir");
return;
}
# PHASE 0: create emergency backup
unless ($dry_run) {
my $ips_backup = "$backup_dir/ips-backup";
lprint "Phase 0: create backup at $ips_backup\n";
system("mkdir -p $ips_backup") and ldie "Cannot create '$ips_backup'\n";
system("cp -a $mars/ips/* $ips_backup/");
}
# PHASE 1: delete all
lprint "Phase 1: delete all IPs\n";
foreach my $peer (@peers) {
my $ip_path = "$mars/ips/ip-$peer";
_create_delete($ip_path);
}
finish_links();
_wait_delete();
_reset_resources();
foreach my $peer (@peers) {
my $ip_path = "$mars/ips/ip-$peer";
foreach my $targ (@peers) {
my $targ_ip = _get_ip($targ);
_push_link_foreign($targ, $targ_ip, ".deleted", $ip_path);
}
}
finish_links();
_reset_resources();
lprint "Wait for effect\n";
sleep(10);
# PHASE 2: regenerate all groups
lprint "Phase 2: re-create all groups\n";
foreach my $group (sort alphanum_cmp keys(%peer_groups)) {
my $group_ip = _get_ip($group);
lprint " GROUP '$group' ip '$group_ip'\n" if $verbose;
my $list = $peer_groups{$group};
my %members = %$list;
foreach my $peer (sort alphanum_cmp keys(%members)) {
my $ip_path = "$mars/ips/ip-$peer";
my $ip = _get_ip($peer);
lprint " PEER '$peer' '$ip'\n" if $verbose;
foreach my $targ (sort alphanum_cmp keys(%members)) {
my $targ_ip = _get_ip($targ);
_push_link_foreign($targ, $targ_ip, $ip, $ip_path);
}
}
}
finish_links();
system("rm -rf $probe_dir");
_reset_resources();
}
# deprecated, to disappear
sub split_cluster_old {
my ($cmd) = @_;
ldie "ssh is disabled\n" unless $ssh_port;
# checks
lprint "Checking...\n";
my $ips_backup = "$backup_dir/ips-backup";
system("mkdir -p $ips_backup") and ldie "OOps";
system("cp -a $mars/ips/* $ips_backup/");
foreach my $peer (lamport_glob("$ips_backup/ip-*")) {
$peer =~ s:^$ips_backup/ip-::;
ssh_cmd($peer, $ssh_probe);
}
lprint "Creating IP backups...\n";
# create ips backup
foreach my $peer (lamport_glob("$ips_backup/ip-*")) {
$peer =~ s:^$ips_backup/ip-::;
next if $peer eq $host;
rsync_cmd($peer, "--max-size=1 --update $peer:$mars/ips/ $ips_backup/", 1);
}
system("cp -a $mars/ips/* $ips_backup/");
my @peers = map { $_ =~ s:^$ips_backup/ip-::; $_ } (lamport_glob("$ips_backup/ip-*"));
ldie "Empty peer list\n" unless @peers;
foreach my $peer (@peers) {
next if $peer eq $host;
rsync_cmd($peer, "--max-size=1 --update $ips_backup $peer:$backup_dir/");
}
# Check symmetry
lprint "Checking symmetry of resource memberships...\n";
my $data_cmd = "ls $mars/resource-*/data-*";
my $standard = qx:$data_cmd:;
ldie "Cannot get local resource participants.\n" unless $standard;
foreach my $peer (@peers) {
next if $peer eq $host;
my $ssh_cmd = make_ssh_cmd($peer) . " '$data_cmd'";
my $compare = qx:$ssh_cmd:;
ldie "Asymmetric $mars/resource-*/data-* with peer $peer\n" unless $compare eq $standard;
}
# delete all hosts everywhere
lprint "Deleting all peers IPs everywhere...\n";
foreach my $peer (@peers) {
print "--- peer $peer\n";
my $cmd = "rm -rf $mars/ips";
ssh_cmd($peer, $cmd, 1);
}
# wait until all peer threads are gone
lprint "Waiting...\n";
sleep(10);
my $cond_cmd = "(( \$(ps ax | grep mars_peer | grep -v grep | wc -l) == 0 ))";
for (my $loop = 0; $loop < $timeout; $loop++) {
my $running = 0;
foreach my $peer (@peers) {
my $status = ssh_cmd($peer, $cond_cmd, 1);
$running++ if $status;
}
last if !$running;
print "--- $running peers have running peer threads.\n";
sleep(1);
}
for (my $loop = 0; $loop < 3; $loop++) {
# re-create all groups everywhere
lprint "RE-CREATE peer IPs...\n";
foreach my $peer (@peers) {
print "--- peer $peer\n";
my $cmd = "rm -f $mars/todo-global/delete* $mars/actual-*/msg-*connection-*; ";
$cmd .= "shopt -s nullglob; ";
$cmd .= "for i in $mars/resource-*; do if ! [[ -e \$i/data-$peer ]] && ! [[ -e \$i/replay-$peer ]]; then rm -rf $backup_dir/\${i##*/}; mv \$i $backup_dir/; fi; done; ";
$cmd .= "mkdir -p $mars/ips; ";
my $sub_list = "{ for dir in $mars/resource-*/data-$peer; do (cd \${dir%/*} && for i in data-*; do echo \$i; done); done; echo x-$peer; }";
my $sub_cmd = "echo RESTORE IP \$j; cp -a $ips_backup/ip-\$j $mars/ips/";
$cmd .= "for j in \$($sub_list | cut -d- -f2- | sort -u); do $sub_cmd; done";
lprint "$cmd\n";
ssh_cmd($peer, $cmd, 1);
}
}
}
sub leave_res_phase0 {
my ($cmd, $res) = @_;
check_not_primary($cmd, $res, 1);
my $errors = 0;
foreach my $tmp (lamport_glob("$mars/resource-$res/todo-$host/*")) {
next if $tmp =~ m:/delete:;
my $status = get_link($tmp, 2);
if ($status) {
lwarn "switch '$tmp' is not off\n";
$errors++;
}
}
foreach my $tmp (lamport_glob("$mars/resource-$res/actual-$host/{is-,logfile-}*")) {
my $status = get_link($tmp);
if ($status) {
lwarn "running status '$tmp' is not off\n";
$errors++;
}
}
if (!$force) {
check_status($cmd, $res, "is-attached", 0, 0, 1);
ldie "there were $errors errors.\n" if $errors;
}
}
sub leave_res_phase1 {
my ($cmd, $res) = @_;
set_link("1", "$mars/resource-$res/work-$host");
_create_delete("$mars/resource-$res/replay-$host");
_create_delete("$mars/resource-$res/data-$host");
_create_delete("$mars/resource-$res/syncstatus-$host");
my $syncpos = "$mars/resource-$res/syncpos-$host";
_create_delete($syncpos) if link_exists($syncpos);
my $skip_check = "$mars/resource-$res/skip-check-$host";
_create_delete($skip_check) if link_exists($skip_check);
my $vstatus = "$mars/resource-$res/verifystatus-$host";
_create_delete($vstatus) if link_exists($vstatus);
_create_delete("$mars/resource-$res/device-$host");
_create_delete("$mars/resource-$res/actsize-$host");
foreach my $dir (lamport_glob("$mars/resource-$res/*-$host/")) {
foreach my $tmp (lamport_glob("${dir}*")) {
_create_delete($tmp);
}
_create_delete($dir);
}
finish_links();
}
# wait for deletions (avoid races with following commands)
sub leave_res_phase2 {
my ($cmd, $res) = @_;
_wait_delete();
_reset_resources();
$force = 0; # this would be too dangerous
log_purge_res($cmd, $res);
err_purge_res($cmd, $res);
}
sub leave_res_phase3 {
my ($cmd, $res) = @_;
_wait_delete();
_reset_resources();
err_purge_res($cmd, $res);
# deprecated: hindering for new deletion method
if ($compat_deletions && $host eq $real_host) {
system("rm -f $mars/resource-$res/log-*");
}
_systemd_trigger($cmd);
}
sub delete_res {
my ($cmd, $res) = @_;
my $basedir = "$mars/resource-$res";
# preconditions
if (! -d $basedir) {
lprint "resource directory '$basedir' does no longer exist.\n";
return;
}
my @host_list = get_total_peers($res);
my $cnt = scalar(@host_list);
if ($cnt > 0) {
my $h_list = join(',', @host_list);
ldie "resource '$res' is not empty: first remove the hosts '$h_list' via leave-resource\n" unless $force;
lwarn "BRUTE FORCE resource destruction: '$res' has $cnt members ($h_list) THESE ARE FINALLY TRASHED right now -- you are RESPONSIBLE for any subsequent problems.\n";
}
for my $path (`find $basedir/* | sort -r`) {
chomp $path;
next if $path =~ m:/(maxnr$|\.deleted-):;
_create_delete($path);
}
set_link("1", "$mars/resource-$res/work-$host");
finish_links();
_wait_delete();
_reset_resources();
_systemd_trigger($cmd);
}
sub logrotate_res {
my ($cmd, $res) = @_;
# report any upgrades / downgrades
set_link($marsadm_version, "$mars/actual-$host/marsadm-version");
if ($force) {
lwarn "operation $cmd --force is DANGEROUS\n";
} else {
return if check_primary($cmd, $res, 0, 1);
}
my $log_glob = "$mars/resource-$res/log-*-$host";
lprint "get logfiles '$log_glob'\n";
my @paths = lamport_glob($log_glob);
my $last;
if (@paths) {
@paths = sort alphanum_cmp @paths;
lprint "found " . scalar(@paths) . " logfiles.\n";
$last = pop(@paths);
} else {
# By definition, logrotate an old logfile must have been produced
# by the _same_ host. Only relevant for dangerous cron --force.
lprint "nothing to $cmd: no old logfiles for '$host' exist.\n";
return;
}
if (-z $last) {
lprint "an empty logfile '$last' already exists, nothing to do.\n" if $verbose;
return;
}
# prevent too many small logfiles when secondaries are not catching up
if (scalar(@paths) >= $thresh_logfiles) {
my @stat = stat($last);
my $size = 0;
if (@stat) {
$size = $stat[7] / (1000 * 1000 * 1000);
}
if ($size < $thresh_logsize) {
lprint "current logfile '$last' is smaller than $thresh_logsize GB, skipping logrotate.\n" if $verbose;
return;
}
}
my $nr = $last;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1);
ldie "logfile '$next' already exists\n" if file_exists($next);
safe_creat($next) unless $dry_run;
my $startnr = get_link("$mars/resource-$res/maxnr", 1);
$startnr = $nr + 1 if ($nr >= $startnr);
set_link("$startnr", "$mars/resource-$res/maxnr");
}
sub _get_deletable_logfiles {
my ($cmd, $res) = @_;
my $min = -1;
my $max = -1;
my %replay_links;
my $max_restarts = 3;
# Race prevention: get the replay links first, afterwards the logfiles
restart:
my @replay_paths = lamport_glob("$mars/resource-$res/replay-*") or ldie "cannot find any replay symlinks\n";
foreach my $path (@replay_paths) {
my $target = get_link($path, 1);
if (!$target) {
ldie "cannot get link '$path'\n" if $max_restarts-- < 0;
goto restart;
}
$replay_links{$path} = $target;
}
my @log_paths = lamport_glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
foreach my $path (@log_paths) {
$path =~ m/\/log-([0-9]+)-/;
my $nr = $1;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
}
foreach my $path (@replay_paths) {
my $target = $replay_links{$path};
$target =~ m/^log-([0-9]+)/;
my $nr = $1;
$max = $nr if ($nr < $max || $max < 0);
}
return ($min, $max);
}
# for old deletion method (to disappear)
my %delete_nrs;
sub _get_delete_nr {
my ($basedir, $total) = @_;
return @{$delete_nrs{$basedir}} if defined($delete_nrs{$basedir});
_scan_caches() unless %total_peers;
my $min_nr = 0;
my $max_nr = 0;
foreach my $path (lamport_glob("$basedir/deleted-*")) {
$path =~ m:/deleted-(.+):;
my $peer = $1;
next unless $peer;
next unless $total || $peer eq $real_host || $any_peers{$peer};
my $link = get_link($path, 1);
$link =~ m/0*([0-9]+)/;
$min_nr = $1 if (defined($1) && ($1 < $min_nr || !$min_nr));
$max_nr = $1 if (defined($1) && $1 > $max_nr);
}
my @deletes = lamport_glob("$basedir/delete-*");
foreach my $path (@deletes) {
$path =~ m:/delete-0*([0-9]+)-(.+):;
my $nr = $1;
my $peer = $2;
next unless $peer;
next unless $total || $peer eq $real_host || $any_peers{$peer};
$max_nr = $nr if (defined($nr) && $nr > $max_nr);
}
my $nr_links = $max_nr - $min_nr;
if ($nr_links > $max_deletions / 2) {
$verbose = 1 if $verbose <= 0;
lwarn "Too many deletion links have accumulated into directory $basedir/.\n";
lwarn "Probably your networking / your firewall rules / another setup problem is causing this, and your monitoring does not notice it.\n";
lwarn "Please fix it.\n";
lwarn "When necessary, remove $basedir/delete-* link by hand.\n";
# There might be holes in the sequence numbers of delete-* links.
# Don't account for them when it comes to aborts.
if (scalar(@deletes) > $max_deletions) {
lwarn "URGENT: please fix it.\n";
ldie "Aborting for safety reasons\n" if $cron_mode && !$force;
}
}
$delete_nrs{$basedir} = [$min_nr, $max_nr];
return @{$delete_nrs{$basedir}};
}
sub _set_delete_nr {
my ($basedir, $min_nr, $max_nr) = @_;
$delete_nrs{$basedir} = [$min_nr, $max_nr];
}
my %del_dirs;
my %del_targets;
sub _replace_delete {
my ($basedir, $target) = @_;
if (!$del_dirs{$basedir}) {
foreach my $link (lamport_glob("$basedir/delete-*")) {
my $targ = readlink($link);
next unless $targ;
$del_targets{$targ} = $link;
}
$del_dirs{$basedir} = 1;
}
my ($min_nr, $delete_nr) = _get_delete_nr($basedir, 1);
if (defined($del_targets{$target})) {
my $old = $del_targets{$target};
$old =~ m:/delete-([0-9]+)-:;
my $old_nr = $1;
return $old if $old_nr >= $min_nr;
}
my $new = sprintf("$basedir/delete-%09d-$real_host", ++$delete_nr);
_set_delete_nr($basedir, $min_nr, $delete_nr);
return $new;
}
my $nr_deletions = 0;
sub _create_delete {
my ($target) = @_;
ldie "cannot delete: '$target' is no absolute path\n" unless $target =~ m:^/:;
if (!$compat_deletions) {
lprint "delete '$target'\n" if $verbose;
set_link(".deleted", $target);
return;
}
my $global_path = "$mars/todo-global";
my $new = _replace_delete($global_path, $target);
lprint "create symlink $new -> $target\n" if $verbose;
set_link($target, $new);
$nr_deletions++;
}
sub _wait_delete {
return if $dry_run;
lwarn "Do not run this in --parallel mode\n" if $child_prefix;
finish_links();
return if !$nr_deletions;
$nr_deletions = 0;
for (;;) {
my $del_link = "$mars/todo-global/deleted-$real_host";
my $deleted = get_link($del_link, 1);
if (!$deleted) {
return if !$compat_deletions;
lwarn "Old deletions: '$del_link' does not exist\n";
sleep_timeout();
return;
}
$deleted =~ s/^0+//;
my ($min_nr, $delete_nr) = _get_delete_nr("$mars/todo-global");
last if $deleted >= $delete_nr;
lprint "waiting for deletions to apply locally....\n";
sleep_timeout();
}
}
sub delete_file_cmd {
my $cmd = shift;
my $res = shift; # ignore this
foreach my $path (@_) {
check_userspace($path);
_create_delete($path);
}
}
sub _purge_del_glob {
my ($clean_glob, $min_time) = @_;
foreach my $path (glob($clean_glob)) {
next unless -l $path;
my $val = readlink($path);
next unless $val eq ".deleted";
my $age = get_link_stamp($path);
next unless $age;
if ($age < $min_time) {
lprint " remove deleted '$path'\n" if $verbose;
unlink($path);
}
}
}
sub _get_min_time {
my ($start_time, $res) = @_;
my $min_time = $start_time;
if (!$force) {
my %status = get_alive_links($res, "time", "*");
foreach my $peer (keys(%status)) {
my $stamp = $status{$peer};
$stamp = 0 if (!defined($stamp) || $stamp eq "");
$min_time = $stamp if $stamp < $min_time;
}
# protect against dead / decommissioned peers
my $back_time = $start_time - 3600 * $keep_backups;
$min_time = $back_time if $back_time > $min_time;
}
$min_time -= $window;
return $min_time;
}
# old deletion method, hopefully to disappear somewhen in future
sub _compat_purge_deleted {
my ($del_glob, $msg_glob) = @_;
my $start_time = mars_time();
foreach my $leftlink (lamport_glob($del_glob)) {
# remove outdated .deleted-* markers
if ($leftlink =~ m:/.deleted-:) {
my $info = get_link($leftlink, 2);
if ($info =~ m/^[0-9]+,[0-9]+$/) {
my ($serial, $mode) = split(/,/, $info);
my $todo_glob = "$mars/todo-global/deleted-*";
if ($mode) {
$leftlink =~ m:/resource-([^/]+)/:;
my $res = $1;
$todo_glob = "$mars/resource-$res/todo-$real_host/deleted-*"
}
# compute the minimum border
my $border = 0;
foreach my $deleted_link (lamport_glob($todo_glob)) {
my $limit = get_link($deleted_link, 2);
next if !$limit || $limit <= 1;
$border = $limit if (!$border || $limit < $border);
}
if ($serial < $border && $serial > 1) {
unlink($leftlink);
next;
}
}
}
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * 24 < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
unlink($leftlink);
}
foreach my $leftlink (lamport_glob($msg_glob)) {
# cleanup historic msg-*
if ($leftlink =~ m:/msg-(conncetion-from-|additional-connection-):) {
lprint " unlink deprecated '$leftlink'\n" if $verbose;
unlink($leftlink);
next;
}
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * 4 < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
unlink($leftlink);
}
}
my $purged_globally = 0;
sub link_purge_global {
my ($cmd) = @_;
return if $purged_globally++;
my $start_time = mars_time();
my $min_time = _get_min_time($start_time, "*");
# new deletion method
my $clean_glob = "$mars/{,*/}{.*,*}";
_purge_del_glob($clean_glob, $min_time);
# old deletion method
my $del_glob = "$mars/{,*/}{.tmp,.deleted,delete,work}-*";
my $msg_glob = "$mars/actual-*/msg-*";
_compat_purge_deleted($del_glob, $msg_glob);
# keep internal backups for at least 1 hour
if ($keep_backups < 1) {
$keep_backups = 1;
}
lprint "removing left-over symlinks...\n" if $verbose;
# remove any old alivelinks
get_global_versions();
my $kernel_compat = get_alive_link("compat-alivelinks", $host, 2);
# only when a new kernel is running
if (defined($kernel_compat) && $kernel_compat ne "") {
my $keep_time = 0;
my $glob = "{alive,buildtag,emergency,features,rest-space,time,tree,usable,used,compat-alive}";
# Notice: $kernel_compat might differ from $compat_alivelinks
# For example, this can happen temporarily during join-cluster & co
if (!$kernel_compat) {
lprint "NEW alivelinks active: purging OLD alivelinks\n" if $verbose;
$glob = "$mars/$glob-*";
} elsif ($compat_alivelinks && $kernel_strategy_version < 3) {
# This should happen only after true downgrades.
# Notice that newer kernels are writing _both_ variants
# during compat mode.
lprint "OLD alivelinks active: purging NEW alivelinks\n" if $verbose;
$glob = "$mars/actual-$host/$glob*";
$keep_time = $keep_backups unless $force;
} else {
$glob = "";
}
if ($glob) {
foreach my $leftlink (lamport_glob($glob)) {
if ($keep_time) {
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * $keep_time < $start_time;
}
_create_delete($leftlink);
}
}
}
foreach my $leftlink (lamport_glob("$mars/backup*")) {
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * $keep_backups < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
system("rm -rf $leftlink");
}
# remove any left-over probe dirs
my $probe_glob = "$mars/probe-*";
foreach my $path (glob($probe_glob)) {
my $age = get_link_stamp($path);
next unless ($age < $min_time);
system("rm -rf $path");
}
}
sub link_purge_res {
my ($cmd, $res) = @_;
link_purge_global($cmd);
my $start_time = mars_time();
my $min_time = _get_min_time($start_time, $res);
# new deletion method
my $clean_glob = "$mars/resource-$res/{,*/}{.*,*}";
_purge_del_glob($clean_glob, $min_time);
# old deletion method
my $del_glob = "$mars/resource-$res/{,*/}{.tmp,.deleted,delete,work}-*";
my $msg_glob = "$mars/resource-$res/actual-*/msg-*";
_compat_purge_deleted($del_glob, $msg_glob);
}
sub logdelete_res {
my ($cmd, $res) = @_;
lprint "removing left-over $res logfiles...\n" if $verbose;
my $start_time = mars_time();
my @paths = lamport_glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
@paths = sort alphanum_cmp @paths;
my ($min_deletable, $max_deletable) = _get_deletable_logfiles(@_);
lprint "min deletable logfile number: $min_deletable\n" if $verbose;
lprint "min non-deletable logfile number: $max_deletable\n" if $verbose;
if ($min_deletable >= $max_deletable) {
lprint "no logfiles are deletable.\n" if $verbose;
return;
}
if ($cmd eq "log-delete-one") {
$max_deletable = $min_deletable + 1; # delete only the first one
}
my $nr = 0;
my $first = shift(@paths);
for (;;) {
last unless $first;
my $next = shift(@paths);
# never delete the very last logfile
last unless $next;
# safeguard: only delete logfiles having a minium age
last if !$force && is_link_recent($first);
$nr = $first;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
next unless $nr < $max_deletable;
lprint "chosen '$first' for deletion\n" if $verbose;
_create_delete($first);
$first = $next;
}
# Determine whether a parallel join-resource is ongoing
my $transient_join = 0;
foreach my $path (lamport_glob("$mars/resource-$res/device-*")) {
my $val = get_link($path, 1);
next unless $val;
my $stamp = get_link_stamp($path);
if ($val eq "(none)") {
lwarn "Transient join-resource detected at '$path'\n";
$transient_join = 1;
# Remove any transient links after a while
if ($stamp < 10 ) {
finish_links();
set_link($val, $path);
finish_links($stamp + 1);
} elsif ($stamp < 20) {
_create_delete($path);
}
next;
}
next if is_member($res, $host);
next unless is_guest($res, $host);
# remove any inactive guests after backup retention period
my $device_on_path = "$mars/resource-$res/actual-$host/if-on";
my $device_on_val = get_link($device_on_path, 1);
my $device_on_stamp = get_link_stamp($device_on_path);
$device_on_stamp = $stamp if $stamp > $device_on_stamp;
# is the guest inactive?
if (defined($device_on_val) &&
$device_on_val ne "" &&
!$device_on_val &&
$device_on_stamp > 20 &&
$device_on_stamp + $keep_backups < $start_time) {
lprint "Purging guest '$res'\n";
#_create_delete($path);
}
}
my @versionlinks = lamport_glob("$mars/resource-$res/version-*");
# When join-resource is humming, or during split-brain,
# don't remove versionlinks as far as possible.
if (scalar(@versionlinks) < $max_deletions / 8 &&
($transient_join ||
!detect_splitbrain($res, 1))) {
lwarn "Keeping some versionlinks\n";
return unless $force;
}
lprint "Removing left-over versionlinks...\n" if $verbose;
foreach my $versionlink (@versionlinks) {
my $nrv = $versionlink;
$nrv =~ s/^.*\/version-([0-9]+)-.+$/$1/;
# we need at least one more version link than logfiles for consistency checks
next unless $nrv < $max_deletable - 1;
_create_delete($versionlink);
}
# remove outdated split-cluster remains
my %peers;
foreach my $path (lamport_glob("$mars/ips/ip-*")) {
$path =~ m:/ip-(.*):;
my $peer = $1;
$peers{$peer} = 1;
}
foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}todo-*/deleted-*")) {
$leftlink =~ m:/deleted-(.*):;
my $peer = $1;
next if $peers{$peer};
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
unlink($leftlink);
}
}
sub cron_phase1 {
my ($cmd, $res) = @_;
$cron_mode = 1;
link_purge_res(@_);
logrotate_res(@_);
finish_links();
_trigger(3);
}
sub cron_phase2 {
my ($cmd, $res) = @_;
$cron_mode = 1;
logdelete_res(@_);
}
sub attach_res_phase0 {
my ($cmd, $res) = @_;
return if $force;
my $detach = ($cmd eq "detach");
if ($detach) {
my $device_in_use = get_link("$mars/resource-$res/actual-$host/open-count", 1);
if ($device_in_use) {
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
if ($want) {
lprint "IMPORTANT: Relying on systemd for $cmd of resource '$res'\n";
my $path = "$mars/resource-$res/todo-$host/attach";
_switch($cmd, $res, $path, 0);
finish_links();
systemd_activate($cmd, $res, 0, 1);
return;
}
my $dev = device_name($res);
ldie "device '$dev' is in use\n";
}
}
}
# only for systemd: wait that primary device is no longer open
sub attach_res_phase0b {
my ($cmd, $res) = @_;
return unless $cmd eq "detach";
return unless systemd_present(@_);
check_status($cmd, $res, "open-count", 0, 1);
wait_cluster($cmd);
}
sub attach_res_phase1 {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
my $path = "$mars/resource-$res/todo-$host/attach";
_switch($cmd, $res, $path, !$detach);
}
sub attach_res_phase2 {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
return if $force;
if (!is_module_loaded()) {
lwarn "Kernel module not loaded: $cmd will become effective after modprobe\n";
return;
}
check_status($cmd, $res, "is-attached", $detach ? 0 : 1, 1);
if ($detach) {
system("sync");
check_mars_device($cmd, $res, 1, 1);
check_status($cmd, $res, "is-replaying", 0, 1);
check_status($cmd, $res, "is-syncing", 0, 1);
system("sync");
}
}
sub fetch_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/disconnect|pause/);
my @paths = lamport_glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/connect", !$pause);
}
}
sub fetch_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/disconnect|pause/);
my $path = "$mars/resource-$res/todo-$host/connect";
_switch($cmd, $res, $path, !$pause);
}
sub pause_sync_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
check_sync_startable(@_) if !$pause;
my @paths = lamport_glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/sync", !$pause);
}
}
sub pause_sync_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
check_sync_startable(@_) if !$pause;
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, !$pause);
}
sub pause_replay_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my @paths = lamport_glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/replay", !$pause);
}
}
sub pause_replay_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my $path = "$mars/resource-$res/todo-$host/replay";
_switch($cmd, $res, $path, !$pause);
}
sub up_res_phase0 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
attach_res_phase0("detach", $res);
} else {
attach_res_phase0("attach", $res);
}
}
sub up_res_phase1 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
pause_replay_local_res("pause-replay-local", $res);
pause_sync_local_res("pause-sync-local", $res);
fetch_local_res("pause-fetch", $res);
attach_res_phase1("detach", $res);
} else {
attach_res_phase1("attach", $res);
fetch_local_res("resume-fetch-local", $res);
# ignore ldie on sync, just do all the rest
eval {
pause_sync_local_res("resume-sync-local", $res);
};
pause_replay_local_res("resume-replay-local", $res);
}
}
sub up_res_phase2 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
attach_res_phase2("detach", $res);
} else {
attach_res_phase2("attach", $res);
}
}
sub set_replay_res {
my ($cmd, $res, $new_nr) = @_;
if (!$new_nr || $new_nr <= 0) {
ldie "you must supply a numeric logfile number as third argument.\n";
}
check_not_primary($cmd, $res);
check_todo($cmd, $res, "replay", 0, 0);
my $replaylink = "$mars/resource-$res/replay-$host";
my $old_val = get_link($replaylink);
my $old_nr = $old_val;
$old_nr =~ s/log-([0-9]+)-.*/$1/;
ldie "old log number '$old_nr' is wrong\n" unless $old_nr > 0;
if ($new_nr > $old_nr) {
lwarn "you try to skip logfile numbers from $old_nr to $new_nr, are you sure?\n";
ldie "you would need --force if you really know what you are doing.\n" unless $force;
}
my $primary = _get_designated_primary($res);
_set_replaylink($cmd, $res, $new_nr, $primary);
}
sub fake_sync_phase1 {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, 0);
}
sub fake_sync_phase2 {
my ($cmd, $res) = @_;
check_status($cmd, $res, "is-syncing", 0, 1);
}
sub fake_sync_phase3 {
my ($cmd, $res) = @_;
my $size = get_link("$mars/resource-$res/size");
my $target = "$mars/resource-$res/syncstatus-$host";
set_link($size, $target);
}
sub _primary_res {
my ($res, $new, $old) = @_;
my $pri = "$mars/resource-$res/primary";
finish_links();
my $old_stamp = get_link_stamp($pri);
set_link($new, $pri);
if ($new eq "(none)") {
# Allow overriding of secondaries in a partitioned cluster
# by using a weaker timestamp.
# When somebody issues a (more or less) _concurrent_ "primary" command
# during a network partition, let the "primary" win over the "secondary".
# Notice: the definition of "concurrent" in the sense of Lamport
# (where _any_ communication may take "arbitrarily" long)
# may be non-intuitive to humans in the presence of a network outage.
finish_links($old_stamp + 1) if $old_stamp;
} else {
my $last_lnk = "$mars/resource-$res/userspace/last-primary";
my $prev_lnk = "$mars/resource-$res/userspace/prev-primary";
system("mv -f $last_lnk $prev_lnk");
set_link($new, $last_lnk);
}
lprint "designated primary changed from '$old' to '$new'\n";
}
# check whether primary/secondary switching is possible at all
sub primary_phase0 {
my ($cmd, $res) = @_;
ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars");
if ($force) {
check_todo($cmd, $res, "fetch", 0, 0);
}
my $old = _get_designated_primary($res, -1);
lprint "Current designated primary: $old\n";
if ($cmd eq "primary") {
if ($host ne $old) {
lprint "Allowing handover in cases of sync: ignore_sync=$ignore_sync\n" if $ignore_sync;
check_sync_finished($res, $host, $ignore_sync);
# also check that other secondaries won't loose their sync primary
my @names = lamport_glob("$mars/resource-$res/data-*");
# for k <= 2 replicas, the previous check must have been sufficient
if (scalar(@names) > 2) {
my $allow_anyway = ($force || $ignore_sync);
lprint "Allowing handover in cases of sync: force=$force ignore_sync=$ignore_sync\n" if $allow_anyway;
foreach my $name (@names) {
$name =~ m:/data-(.+):;
my $peer = $1;
next if ($peer eq $old || $peer eq $host);
check_sync_finished($res, $peer, $allow_anyway);
}
}
}
check_todo($cmd, $res, "attach", 1, 0);
check_todo($cmd, $res, "fetch", 1, 0) if !$force;
check_todo($cmd, $res, "replay", 1, 0);
# check that no logfile replay errors exist.
my $replay_error = get_link("$mars/resource-$res/actual-$host/replay-code", 2);
if (defined($replay_error) && $replay_error ne "" && int($replay_error) < 0) {
lwarn "Logfile replay / recovery stopped with error code $replay_error.\n";
ldie "Won't switch to avoid unnoticed data loss. You may however do a 'primary --force'.\n" unless $force;
}
}
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
if ($want) {
my $new;
my $oper;
if ($cmd eq "primary") {
$new = $host;
$oper = "start";
} else {
$new = "(none)";
$oper = "stop";
}
set_link($new, $want_path);
my $unit_path = "$mars/resource-$res/systemd-$oper-unit";
my $unit = get_link($unit_path, 2);
if ($unit) {
if ($old ne "(none)") {
my $response_path = "$mars/resource-$res/userspace/systemd-status-stop-$old";
set_link(0, $response_path);
}
lprint "IMPORTANT: Relying on systemd for $oper of unit '$unit'\n";
lprint "IMPORTANT: unit '$unit' wanted at '$new'\n";
finish_links();
_systemd_trigger($cmd);
_trigger(3);
return;
}
}
return if ($old eq $host and $cmd eq "primary");
return if $old eq "(none)";
my $open_count_path = "$mars/resource-$res/actual-$old/open-count";
my $device_in_use = get_link($open_count_path, 1);
if ($device_in_use) {
my $dev = device_name($res, $old);
lwarn "device '$dev' for resource '$res' is $device_in_use times in use on primary host '$old'\n";
ldie "first you must umount/close the device (on host '$old')\n" unless $force;
lwarn "First you SHOULD umount/close the device (on host '$old'), but you ignore this recommendation by giving the --force option.\n";
if (is_link_recent($open_count_path)) {
lwarn "You are forcing a SPLIT BRAIN via --force right now. Do you know that this is an ERRONEOUS state? Do you really know what you are doing?\n";
} else {
lwarn "You may produce a SPLIT BRAIN via --force because the peer host '$old' is currently not reachable.\n";
}
}
lprint "all preconditions OK for resource '$res'\n";
}
# only for primary --force: wait until fetch has actually stopped
sub primary_phase0a {
my ($cmd, $res) = @_;
my $new = $host;
if (!$force && $cmd =~ m/primary/) {
lprint "Prepare new primary '$new' handover\n";
_switch($cmd, $res, "$mars/resource-$res/todo-$new/fetch", 1);
}
finish_links();
return unless $force;
wait_cond($cmd, $res, "is-fetch-off");
}
# only for systemd: wait that primary device is no longer open
sub primary_phase0b {
my ($cmd, $res) = @_;
return unless systemd_present(@_);
# only relevant for true handover
my $old = _get_designated_primary($res, -1);
return if ($old eq $host || $old eq "(none)");
# ignore primary --force
my $connect_path = "$mars/resource-$res/todo-$host/connect";
my $connect = get_link($connect_path, 1);
return if (!defined($connect) || !$connect);
# Notice: this is a workaround for a problem which is
# outside of our scope. For example, a remote umount will
# fail when any (delayed) process has any filehandle open.
# As long as the umount was unsuccessful, we _cannot_ proceed
# with planned handover.
# As a workaround, we constantly trigger the remote systemd
# in the _hope_ that the umount will succeed, and the
# open-count will then go down to zero, hopefully somewhen.
my $watch = "$mars/resource-$res/systemd-want";
my $action = "";
if (link_exists($watch)) {
$action = "system(\"touch -h $watch\");";
my $response_path = "$mars/resource-$res/userspace/systemd-status-stop-$old";
$action .= "\$action_status = get_link(\"$response_path\");";
my $msg = "systemctl stop on peer $old: status=\$action_status\n";
$action .= "ldie \"$msg\" if \$action_status;";
}
# try to compensate failures by systemd restart
my $compensation = undef;
if ($old ne "(none)") {
$compensation = "lprint \"Restarting '$res' on '$old'\n\"; ";
$compensation .= "_primary_res(\"$res\", \"$old\", \"(none)\"); ";
$compensation .= "set_systemd_want(\"$cmd\", \"$res\", \"$old\"); ";
$compensation .= "finish_links(); ";
$compensation .= "_trigger(3); ";
$compensation .= "lprint \"Triggered systemd at '$old'.\n\";";
}
check_status($cmd, $res, "open-count", 0, 1, undef, undef, $old, $action, $compensation);
}
# when necessary, switch to secondary (intermediately)
sub primary_phase1 {
my ($cmd, $res) = @_;
return 0 if ($force and $cmd eq "primary");
my $old = _get_designated_primary($res, -1);
return 0 if ($old eq $host and $cmd eq "primary");
my $new = "(none)";
if (!$force and $cmd eq "primary") {
my $status = try_to_avoid_splitbrain($cmd, $res, $old);
return $status if $status;
}
return 0 if $old eq $new;
_primary_res($res, $new, $old);
return 0;
}
my $phase2_waited = 0;
sub primary_phase1b {
my ($cmd, $res) = @_;
$phase2_waited = 0;
finish_links();
return 0 if $force;
my $old = _get_designated_primary($res, -1);
my $status = check_primary_gone($cmd, $res, $old);
return $status if $status;
if (!$force and $cmd eq "primary") {
my $status = try_to_avoid_splitbrain($cmd, $res, $old);
return $status if $status;
}
return 0;
}
# when necessary, wait
sub primary_phase2 {
my ($cmd, $res) = @_;
return 0 if $force;
return 0 unless $cmd eq "primary";
wait_cluster($cmd) if !$phase2_waited++;
my $old = _get_designated_primary($res, -1);
return check_primary_gone($cmd, $res, $old);
}
sub primary_phase2b {
my ($cmd, $res) = @_;
return 0 if $force;
if (systemd_present(@_)) {
my $old = _get_designated_primary($res, -1);
return try_to_avoid_splitbrain($cmd, $res, $old);
}
return 0;
}
# when necessary, switch to primary
sub primary_phase3 {
my ($cmd, $res) = @_;
return unless $cmd eq "primary";
my $old = _get_designated_primary($res, -1);
my $new = $host;
_primary_res($res, $new, $old);
}
sub primary_phase3b {
finish_links();
}
# wait for device to appear / disappear
sub primary_phase4 {
my ($cmd, $res) = @_;
if($cmd eq "secondary") {
check_mars_device($cmd, $res, 1, 1);
return;
}
my $ok = detect_splitbrain($res, 1);
if (!$ok) {
lwarn "\n";
lwarn "Sorry, in split brain situations I can only set the _designated_\n";
lwarn "primary, but I cannot _guarantee_ that becoming the\n";
lwarn "_actual_ primary is always possible.\n";
lwarn "You SHOULD resolve the split brain ASAP (e.g. by leave-resource\n";
lwarn "or invalidate etc).\n";
lwarn "\n";
lwarn "If you already tried to resolve the split brain manually, but\n";
lwarn "this message does not disappear, the reason could be some\n";
lwarn "hindering left-overs/remains from the former split brain.\n";
lwarn "ONLY in such a case, try log-purge-all --force.\n";
lwarn "\n";
return;
}
check_mars_device($cmd, $res, 1, 0);
# new switch semantics, when nothing has failed before: up
up_res_phase1(@_);
_systemd_trigger($cmd);
}
sub wait_umount_res {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/actual-$host/open-count";
while (1) {
my $sum = get_link($path);
last if !$sum;
lprint "device for resource '$res' is $sum times in use on $host\n";
sleep_timeout(1);
}
lprint "OK, device for resource '$res' is not in use.\n";
}
sub invalidate_res_phase0 {
my ($cmd, $res) = @_;
check_not_primary($cmd, $res);
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', some other designated primary must exist (currently there is none)\n" if $primary eq "(none)";
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
}
sub invalidate_res_phase1 {
my ($cmd, $res) = @_;
_switch($cmd, $res, "$mars/resource-$res/todo-$host/attach", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/fetch", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/replay", 0);
}
sub invalidate_res_phase2 {
my ($cmd, $res) = @_;
if (!$force) {
check_status($cmd, $res, "is-syncing", 0, 1);
check_status($cmd, $res, "is-fetching", 0, 1);
check_status($cmd, $res, "is-replaying", 0, 1);
check_status($cmd, $res, "is-attached", 0, 1);
}
}
sub invalidate_res_phase3 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
my $replay = get_link("$mars/resource-$res/replay-$primary");
$replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n";
my $replay_nr = $1;
set_link("0", $dst);
finish_links(); # opportunity for errors => don't continue
for my $vers_path (lamport_glob("$mars/resource-$res/version-*-$host")) {
$vers_path =~ m:/version-([0-9]+):;
my $this_nr = $1;
_create_delete($vers_path) if $this_nr >= $replay_nr;
}
_create_delete("$mars/resource-$res/replay-$host");
finish_links();
}
sub invalidate_res_phase4 {
my ($cmd, $res) = @_;
_wait_delete();
}
sub invalidate_res_phase5 {
my ($cmd, $res) = @_;
$force = 0; # this would be too dangerous
log_purge_res(@_);
err_purge_res(@_);
}
sub invalidate_res_phase6 {
my ($cmd, $res) = @_;
_wait_delete();
}
sub invalidate_res_phase7 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
my $replay = get_link("$mars/resource-$res/replay-$primary");
$replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n";
my $replay_nr = $1;
_set_replaylink($cmd, $res, $replay_nr, $primary, "");
finish_links();
}
sub invalidate_res_phase8 {
my ($cmd, $res) = @_;
_wait_delete();
}
sub invalidate_res_phase9 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
_switch($cmd, $res, "$mars/resource-$res/todo-$host/attach", 1);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/fetch", 1);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/replay", 1);
finish_links();
my $emergency_path = "$mars/resource-$res/actual-$primary/has-emergency";
my $emergency = get_link($emergency_path, 1);
if ($emergency) {
lwarn "Primary '$primary' is in emergency mode. Cannot start sync.\n";
} else {
_switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 1);
}
}
my %resize_device_size;
my %resize_old_size;
my %resize_new_size;
sub get_possible_size {
my ($cmd, $res) = @_;
my @actsizes = lamport_glob("$mars/resource-$res/actsize-*");
lwarn "resource $res has no actsize-* symlinks\n" unless @actsizes;
my $possible_size = 0;
foreach my $actsize (@actsizes) {
my $this_size = get_link($actsize);
if (!$possible_size || $this_size < $possible_size) {
$possible_size = $this_size;
}
}
return $possible_size;
}
sub resize_phase0 {
my ($cmd, $res, $size_arg) = @_;
ldie "mars kernel module is not loaded. This is needed for communication with some other hosts!\n" if !is_module_loaded();
my $new_size = 0;
if ($size_arg) {
$new_size = get_size($size_arg);
ldie "optional size argument '$new_size' must be numeric and positive\n" unless scalar($new_size) > 0;
lprint "new size: $new_size bytes\n";
}
check_primary($cmd, $res, 1);
my $my_size = get_link("$mars/resource-$res/actsize-$host");
my $lnk = "$mars/resource-$res/size";
my $old_size = get_link($lnk);
lprint "old_size=$old_size\n";
my $possible_size = get_possible_size($cmd, $res);
lprint "possible_size=$possible_size\n";
$new_size = $possible_size if !$new_size;
lprint "new_size=$new_size\n";
ldie "new size $new_size is higher than the possible size (minimum of all volumes) $possible_size" if $new_size > $possible_size; # no override with --force possible
# disallow decreasing
ldie "only increases of the size are possible!\n" if $new_size < $old_size;
my $waste = $my_size - $new_size;
lwarn "You are wasting $waste bytes locally\n" if $my_size > $new_size;
# remember values
$resize_device_size{$res} = _get_mars_size(@_);
$resize_old_size{$res} = $old_size;
lwarn "internal mismatch between actual device size and resource size: $resize_device_size{$res} != $resize_old_size{$res}\n" unless $resize_device_size{$res} == $resize_old_size{$res};
$resize_new_size{$res} = $new_size;
return 0;
}
sub resize_phase1 {
my ($cmd, $res) = @_;
my $old_size = $resize_old_size{$res} or ldie "bad internal size value\n";
my $new_size = $resize_new_size{$res} or ldie "bad internal size value\n";
# for safety, check again
ldie "only increases of the size are possible!\n" if $new_size < $old_size;
check_primary($cmd, $res, 1);
# Mark the primary data / its size as authoritative
my $act_lnk = "$mars/resource-$res/syncstatus-$host";
set_link($new_size, $act_lnk);
finish_links(); # Chance for errors to pop up
# Now set the new resource size
my $lnk = "$mars/resource-$res/size";
set_link($new_size, $lnk);
finish_links();
}
sub resize_phase2 {
my ($cmd, $res) = @_;
my $old_size = $resize_old_size{$res} or ldie "bad internal size value\n";
my $new_size = $resize_new_size{$res} or ldie "bad internal size value\n";
for (;;) {
my $new_device_size = _get_mars_size(@_);
if ($new_device_size == $resize_new_size{$res}) {
lprint "Device size is now $new_device_size.\n";
last;
}
lprint "Device size $new_device_size has not yet reached the new size $resize_new_size{$res}.\n";
if ($new_device_size != $resize_device_size{$res}) {
lwarn "The size has changed, but did not reach the correct value.";
lwarn "Assuming some rounding problems (which may occur at some device types)\n";
last;
}
sleep_timeout();
}
}
# Remark: this is historic, and should not be used anymore.
# However, it likely needs to be kept for DRBD similarity.
sub role_cmd {
my ($cmd, $res, $peer) = @_;
$peer = $host unless (defined($peer) && $peer);
my $is_primary = is_actual_primary($cmd, $res, $peer);
my $todo_primary = _get_designated_primary($res);
my $msg = "I am actually ";
$msg .= $is_primary ? "primary" : "secondary";
if ($todo_primary ne $peer) {
$msg .= " and $todo_primary should be primary";
}
lprint $msg . "\n";
}
# Deprecated. Check whether it can be removed.
sub mars_state_cmd {
my ($cmd, $res, $peer) = @_;
$peer = $host unless (defined($peer) && $peer);
my $is_primary = is_actual_primary($cmd, $res, $peer);
my $todo_primary = _get_designated_primary($res);
if ($is_primary) {
lprint "is_primary\n";
return;
}
if ($todo_primary eq $peer) {
lprint "becoming_primary\n";
return;
}
# secondary without ambitions to become primary
my $size = get_link("$mars/resource-$res/size");
my $syncstatus = get_link("$mars/resource-$res/syncstatus-$peer");
if ($syncstatus != $size) {
lprint "secondary inconsistent ($syncstatus bytes of $size)\n";
return;
}
if ($todo_primary eq "(none)") {
lprint "secondary\n";
return;
}
my $primary_replay = get_link("$mars/resource-$res/replay-$todo_primary");
my $peer_replay = get_link("$mars/resource-$res/replay-$peer");
if ($primary_replay eq $peer_replay) {
lprint "secondary uptodate\n";
return;
}
lprint "secondary outdated ($peer_replay instead of $primary_replay)\n";
}
sub cat_cmd {
my $cmd = shift;
foreach my $path (@_) {
_get_text($path, undef, 1);
}
}
sub mars_info_cmd {
my ($cmd, $res) = @_;
my $info = "$mars/resource-$res/logstatus-$host.status";
cat_cmd($cmd, $info);
}
sub show_cmd {
my ($cmd, $res) = @_;
$res = "*" if !$res || $res eq "all";
$res = "{$res}" if $res =~ m/,/;
my $glob = "$mars/{ips/ip-$host,alive-$host,emergency-$host,rest-space-$host,resource-$res/{device,primary,size,actsize-$host,syncstatus-$host,replay-$host,actual-$host/*,todo-$host/*}}";
foreach my $link (lamport_glob($glob)) {
next unless link_exists($link);
my $res = get_link($link);
my $short = $link;
$short =~ s:^$mars/::;
lprint "$short=$res\n";
}
}
sub show_errors_cmd {
my ($cmd, $res) = @_;
my $text = get_error_text(@_);
if ($text) {
lprint $text;
ldie "resource $res has some (old) problems.\n";
} else {
lprint "no errors/warnings are reported.\n";
}
}
sub version {
lprint "$0 Version: $marsadm_version $Id\n";
#lprint "my IP is $ip\n";
exit 0;
}
##################################################################
# pretty printing
sub seconds2human {
my $seconds = shift;
return "--:--:--" unless (defined($seconds) && $seconds ne "" && $seconds >= 0);
return sprintf("%02d:%02d:%02d", $seconds / 3600, ($seconds % 3600) / 60, $seconds % 60);
}
sub number2human {
my ($number, $unit, $max) = @_;
$max = $number if !defined($max);
my $k = 1024.0;
my $flip_over = 9.99;
if (!defined($unit) || !$unit) {
if ($max >= $k * $k * $k * $k * $flip_over) {
$unit = "T";
} elsif ($max >= $k * $k * $k * $flip_over) {
$unit = "G";
} elsif ($max >= $k * $k) {
$unit = "M";
} elsif ($max >= $k) {
$unit = "K";
} else {
$unit = "";
}
}
my $i = "i";
if ($unit =~ m/^[a-z]/) {
$i = "";
$k = 1000.0;
}
$_ = $unit;
SWITCH: {
if (/t/i) {
$number /= $k * $k * $k * $k;
$unit = "T${i}B";
last SWITCH;
}
if (/g/i) {
$number /= $k * $k * $k;
$unit = "G${i}B";
last SWITCH;
}
if (/m/i) {
$number /= $k * $k;
$unit = "M${i}B";
last SWITCH;
}
if (/k/i) {
$number /= $k;
$unit = "K${i}B";
last SWITCH;
}
$unit = "B";
}
if ($unit eq "B") {
$number = int($number + 0.1);
} else {
$number = sprintf("%.3f", $number);
}
return "$number $unit";
}
sub progress_bar {
my ($length, $min, $mid, $max) = @_;
$min = 0 if $min < 0;
$mid = $min if $mid < $min;
$max = $mid if $max < $mid;
$max = 1 if $max < 1;
my $pos1 = $length * $min / $max;
my $count = $pos1 - 1;
$count = 0 if $count < 0;
my $bar = '=' x $count;
if ($pos1 < $length) {
$bar .= ">";
$pos1++;
}
my $pos2 = $length * $mid / $max;
if ($pos1 < $pos2) {
$bar .= ':' x ($pos2 - $pos1);
}
if ($pos2 < $length) {
$bar .= '.' x ($length - $pos2);
}
return "[$bar]";
}
##################################################################
# macro evaluation
sub make_numeric {
my $number = shift;
return 0 if (!defined($number) || $number eq "");
return $number;
}
sub set_args {
my $outer_env = shift;
my $inner_env = shift;
$$inner_env{"callstack"} .= "," if $$inner_env{"callstack"};
$$inner_env{"callstack"} .= ${_[0]};
my $index = 0;
while (defined(my $next = shift)) {
my $arg = parse_macro($next, $outer_env);
$$inner_env{$index++} = $arg;
}
# clear all other number variables to avoid confusion
while (defined($$inner_env{$index})) {
undef $$inner_env{$index++};
}
}
# evaluate a primitive macro
sub eval_fn {
my $env = shift;
my $fn = shift;
# optionally allow primitive macros without prefix primitive- to be substituted
if ($fn !~ s/^primitive[-_]?//) {
my $macro = get_macro($fn, 1);
if ($macro) {
set_args($env, $env, $fn, @_);
return parse_macro($macro, $env);
}
}
my $arg1 = shift;
$_ = $fn;
SWITCH: {
if (/^$/) { # variable
my $varname;
# prefix *crement operators
if ($arg1 =~ m/^([-+]{2})$/) {
my $op = $arg1;
$varname = parse_macro(shift, $env);
if ($op =~ m/^\+/) {
$$env{$varname}++;
} else {
$$env{$varname}--;
}
} else {
$varname = parse_macro($arg1, $env);
}
my $result = "";
if (defined($$env{$varname})) {
$result = $$env{$varname};
}
# postfix *crement operators
if (defined(${_[0]}) && ${_[0]} =~ m/^([-+]{2})$/) {
my $op = shift;
if ($op =~ m/^\+/) {
$$env{$varname}++;
} else {
$$env{$varname}--;
}
}
# provisionary light-weight arrays based on CSV format
if (defined(my $delim = shift) && defined(my $index = shift)) {
$delim = parse_macro($delim, $env);
$index = parse_macro($index, $env);
my @list = split($delim, $result);
# get last element when denoted
$index = scalar(@list) - 1 if ($index eq "" || $index eq "*");
$result = $list[$index];
}
return $result;
}
if (/^let$/) { # assignment
my $varname = parse_macro($arg1, $env);
my $arg2 = shift;
my $value = parse_macro($arg2, $env);
$$env{$varname} = $value;
return "";
}
if (/^append$/) { # .=
my $varname = parse_macro($arg1, $env);
my $arg2 = shift;
my $value = parse_macro($arg2, $env);
$$env{$varname} .= $value;
return "";
}
if (/^set$/) { # provisionary light-weight arrays based on CSV format
my $varname = parse_macro($arg1, $env);
my $delim = shift;
$delim = parse_macro($delim, $env);
my $index = shift;
$index = parse_macro($index, $env);
my @list = split($delim, $$env{$varname});
# append to list when denoted
$index = scalar(@list) if ($index eq "" || $index eq "*");
my $value = shift;
$value = parse_macro($value, $env);
$list[$index] = $value;
$$env{$varname} = join($delim, @list);
return "";
}
if (/^dump[-_]?vars$/) { # write to stderr
foreach my $key (sort alphanum_cmp keys(%$env)) {
next if $key =~ m/^__.*__$/;
my $val = $$env{$key};
lprint_stderr "$key='$val'\n";
}
return "";
}
if (/^([-+*\/%&|^]|>>|<<|min|max)$/) { # arithmetic / associative operators
my $op = $1;
my $number = make_numeric(parse_macro($arg1, $env));
while (defined(my $next = shift)) {
my $operand = make_numeric(parse_macro($next, $env));
$_ = $op;
if (/^\+$/) { $number += $operand; next; }
if (/^-$/) { $number -= $operand; next; }
if (/^\*$/) { $number *= $operand; next; }
if (/^\/$/) { $number /= $operand; next; }
if (/^%$/) { $number %= $operand; next; }
if (/^&$/) { $number &= $operand; next; }
if (/^\|$/) { $number |= $operand; next; }
if (/^\^$/) { $number ^= $operand; next; }
if (/^<<$/) { $number <<= $operand; next; }
if (/^>>$/) { $number >>= $operand; next; }
if (/^min$/) { $number = $operand if $number < $operand; next; }
if (/^max$/) { $number = $operand if $number > $operand; next; }
ldie "bad arithmetic operator '$op'";
}
return $number;
}
if (/^([<>]=?|[!=]=)$/) { # numeric comparisons
my $op = $1;
my $n1 = make_numeric(parse_macro($arg1, $env));
my $arg2 = shift;
my $n2 = make_numeric(parse_macro($arg2, $env));
$_ = $op;
if (/^<$/) { return $n1 < $n2; }
if (/^>$/) { return $n1 > $n2; }
if (/^<=$/) { return $n1 <= $n2; }
if (/^>=$/) { return $n1 >= $n2; }
if (/^==$/) { return $n1 == $n2; }
if (/^!=$/) { return $n1 != $n2; }
ldie "bad comparison operator '$op'";
}
if (/^(lt|gt|le|ge|eq|ne|match|=~)$/) { # binary string operators
my $op = $1;
$op = "=~" if $op eq "match";
my $n1 = parse_macro($arg1, $env);
my $arg2 = shift;
my $n2 = parse_macro($arg2, $env);
$_ = $op;
if (/^lt$/) { return $n1 lt $n2; }
if (/^gt$/) { return $n1 gt $n2; }
if (/^le$/) { return $n1 le $n2; }
if (/^ge$/) { return $n1 ge $n2; }
if (/^eq$/) { return $n1 eq $n2; }
if (/^ne$/) { return $n1 ne $n2; }
if (/^=~$/) {
my $opts = "m";
my $arg3 = shift;
$opts = parse_macro($arg3, $env) if defined($arg3);
# unfortunately standard regex operators don't seem to accept variable options, so we use eval()
my $result = eval("\$n1 =~ m{$n2}$opts");
return "" unless defined($result);
return $result;
}
ldie "bad binary operator '$op'";
}
if (/^(&&|\|\||and|or)$/) { # shortcut operators
my $op = $1;
$op = "&&" if $op eq "and";
$op = "||" if $op eq "or";
my $number = parse_macro($arg1, $env);
while (defined(my $next = shift)) {
$_ = $op;
if (/^&&$/) { return 0 if !$number; }
if (/^\|\|$/) { return 1 if $number; }
my $operand = parse_macro($next, $env);
$_ = $op;
if (/^&&$/) { $number &= $operand; next; }
if (/^\|\|$/) { $number |= $operand; next; }
ldie "bad shortcut operator '$op'";
}
return $number;
}
if (/^([~!]|not)$/) { # unary operators
my $op = $1;
$op = "!" if $op eq "not";
my $number = parse_macro($arg1, $env);
$_ = $op;
if (/^~$/) { return ~$number; }
if (/^!$/) { return !$number; }
ldie "bad unary operator '$op'";
}
# string functions
if (/^toupper$/) {
my $txt = parse_macro($arg1, $env);
return uc($txt);
}
if (/^tolower$/) {
my $txt = parse_macro($arg1, $env);
return lc($txt);
}
if (/^length$/) { # string length
my $txt = parse_macro($arg1, $env);
return length($txt);
}
if (/^subst$/) { # regex substitution operator
my $txt = parse_macro($arg1, $env);
my $arg2 = shift;
my $regex = parse_macro($arg2, $env);
my $arg3 = shift;
my $subst = parse_macro($arg3, $env);
my $opts = "m";
my $arg4 = shift;
$opts = parse_macro($arg4, $env) if defined($arg4);
# unfortunately standard regex operators don't seem to accept variable options, so we use eval()
eval("\$txt =~ s{$regex}{$subst}$opts");
return $txt;
}
if (/s?printf$/) { # sprintf()
my $fmt = $arg1; # exception: don't evaluate the format string!
my @list = ();
while (defined(my $next = shift)) {
my $operand = parse_macro($next, $env);
push @list, $operand;
}
return sprintf($fmt, @list);
}
if (/^human[-_]?numbers?$/) { # convert numbers to readable format
my $name = $_;
my $unit = parse_macro($arg1, $env);
my $arg2 = shift;
my $delim_numbers = parse_macro($arg2, $env);
$delim_numbers = "/" if $delim_numbers eq "";
my $arg3 = shift;
my $delim_unit = parse_macro($arg3, $env);
$delim_unit = " " if $delim_unit eq "";
my $max = 0;
my @list = ();
while (defined(my $next = shift)) {
my $number = make_numeric(parse_macro($next, $env));
push @list, $number;
$max = $number if $number > $max;
}
lwarn "macro $name: no number arguments given\n" unless @list;
my $conv_unit = "";
my @results = ();
foreach my $number (@list) {
my $conv_number = number2human($number, $unit, $max);
$conv_number =~ s/ *([a-z].*)//i;
$conv_unit = $1;
push @results, $conv_number;
}
return join($delim_numbers, @results) . "$delim_unit$conv_unit";
}
if (/^human[-_]?seconds$/) { # convert numbers to readable format
# don't use make_numeric() here in order to allow the result '--:--:--'
my $number = parse_macro($arg1, $env);
return seconds2human($number);
}
if (/^progress$/) { # progress bar
my $length = make_numeric(parse_macro($arg1, $env));
$length = 20 unless ($length && $length > 0);
my $arg2 = shift;
if (!defined($arg2)) { # use default values
my @vector = split(':', eval_fn($env, "summary-vector", ":"));
return progress_bar($length, @vector);
}
my $min = make_numeric(parse_macro($arg2, $env));
my $arg3 = shift;
my $mid = make_numeric(parse_macro($arg3, $env));
my $arg4 = shift;
my $max = make_numeric(parse_macro($arg4, $env));
return progress_bar($length, $min, $mid, $max);
}
if (/^errno[-_]?text$/) {
my $code = parse_macro($arg1, $env);
return "" unless defined($code) && $code != 0;
$code = -$code if $code < 0;
return __conv_errno("", $code);
}
if (/^get[-_]?log[-_]?status/) {
return get_error_text($$env{"cmd"}, $$env{"res"});
}
if (/^get[-_]?resource[-_]?(fat|err|wrn)([-_]?count)?/) {
my $what = $1;
my $do_count = $2;
my %assoc = ("fat" => 4, "err" => 3, "wrn" => 2);
my $glob = $$env{"resdir"} . "/$assoc{$what}.*.status";
return _get_text($glob, undef, 0, $do_count);
}
if (/^warn/) {
my $txt = parse_macro($arg1, $env);
lwarn $txt;
return "";
}
if (/^die$/) {
my $txt = parse_macro($arg1, $env);
ldie $txt;
return "";
}
if (/^is-module-loaded$/) {
my $path = "/proc/sys/mars";
my $result = -d $path;
return defined($result) && $result;
}
if (/^is-(member|guest)$/) {
my $type = $1;
$arg1 = parse_macro($arg1, $env);
$arg1 = $$env{"res"} unless $arg1;
my $arg2 = shift;
$arg2 = parse_macro($arg2, $env);
$arg2 = $$env{"host"} unless $arg2;
my $result;
if ($type eq "guest") {
$result = is_guest($arg1, $arg2);
} else {
$result = is_member($arg1, $arg2);
}
return $result ? 1 : 0;
}
# list objects
if (/^(count[-_]?)?(cluster|resource|guest)[-_]?members$/) {
my $old = $_;
$_ =~ s/members/peers/;
lwarn "deprecated: please use macro '$_' instead of '$old'\n";
}
if (/^(count[-_]?)?(cluster|resource|guest)[-_]?peers$/) {
my $do_count = $1;
my $type = $2;
my @peers;
if ($type eq "cluster") {
@peers = get_total_peers();
} elsif ($type eq "guest") {
@peers = get_guest_peers($$env{"res"});
} else {
@peers = get_member_peers($$env{"res"});
}
return scalar(@peers) if defined($do_count);
my $list = "";
foreach my $peer (@peers) {
$list .= "$peer\n";
}
return $list;
}
if (/^(count[-_]?)?(my|all)[-_]?(resources|members|guests)$/) {
my $do_count = $1;
my $what = $2;
my $type = $3;
my $peer = "";
if ($what eq "my") {
$peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
}
my @list;
if ($type eq "guests") {
@list = get_guest_resources($peer);
} elsif ($type eq "members") {
@list = get_member_resources($peer);
} else {
@list = get_total_resources($peer);
}
return scalar(@list) if defined($do_count);
my $list = "";
foreach my $item (@list) {
$list .= "$item\n";
}
return $list;
}
# low-level symlink access
if (/^readlink$/) {
my $path = parse_macro($arg1, $env);
return get_link($path, 1);
}
if (/^setlink$/) {
my $src = parse_macro($arg1, $env);
my $arg2 = shift;
my $dst = parse_macro($arg2, $env);
set_link($src, $dst);
return "";
}
# high-level state access
if (/^(get|todo|actual)[-_]?primary$/) {
my $op = $1;
my $primary;
if ($op eq "actual") {
lwarn "DEPRECATED: you are trying to uniquely identify an actual primary hostname, but this is conceptually wrong because in split-brain situations there may exist multiple ones. Use view-is-primary instead. That would be safe.\n" unless $$env{cmd} =~ m/-1and1$/;
$primary = _get_actual_primary($$env{"res"});
} else {
$primary = _get_designated_primary($$env{"res"});
}
$primary = "" if (!defined($primary) || $primary eq "(none)");
$primary = ($primary eq $$env{"host"}) if $op eq "todo";
return $primary;
}
if (/^todo[-_]?secondary$/) {
my $val = eval_fn($env, "get-primary", $arg1);
return $val eq "(none)" ? 1 : 0;
}
if (/^todo[-_]?(attach|sync|fetch|replay)?$/) {
my $what = $1;
$what = parse_macro($arg1, $env) unless defined($what);
my $lnk = $$env{"resdir"} . "/todo-" . $$env{"host"} . "/$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^get[-_]?msg$/) {
my $what = parse_macro($arg1, $env);
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/msg-$what";
return get_link($lnk, 1);
}
if (/^(all|the)[-_](pretty[-_]?)?(global[-_]?)?((?:err|wrn|inf)[-_])?(msg|count)$/) {
my $shorten = ($1 eq "the");
my $pretty = $2;
my $global = $3;
my $specific = $4;
my $type = $5;
$specific = "" unless defined($specific);
my $glob = "$mars";
$glob = $$env{"resdir"} if (!defined($global) && $$env{"res"});
$glob .= "/actual-" . $$env{"host"} . "/msg-$specific*";
my $result = "";
my $count = 0;
foreach my $msg_path (lamport_glob($glob)) {
my $val = get_link($msg_path, 1);
if ($shorten) {
# skip uninteresting messages
next if $val =~ m/^OK/;
# skip _transient_ error messages
if ($msg_path =~ m/-err-/ && $val =~ m/^([0-9]+\.[0-9]{9})/) {
my $stamp = $1;
if ($stamp) {
my $delta = $timeout > 0 ? $timeout : 30;
next if $stamp + $delta > time();
}
}
}
$val = _replace_timestamps($val, $shorten) if defined($pretty);
my $key = $msg_path;
$key =~ s:^.*/msg-::;
$result .= "$key: $val\n";
$count++;
}
return $count if $type eq "count";
return $result;
}
if (/^potential[-_]?features$/) {
my $all_flags = ~0x0;
my $result = featuresflags2txt($all_flags, 1);
return $result;
}
if (/^(implemented|usable)[-_]?(features|compressions|digests)$/) {
my $op = $1;
my $restrict = $2;
$op = "features" if $op eq "implemented";
my $str = get_alive_link($op, $$env{"host"}, 2);
my $flags;
if ($str =~ m/,(0x[0-9a-f]*)/) {
$flags = $1;
}
$flags = 0x0 unless (defined($flags) && $flags ne "");
$flags = hex($flags) if $flags =~ m/^0x/;
if ($restrict =~ m/compressions/) {
$flags &= $compress_features;
} elsif ($restrict =~ m/digests/) {
$flags &= $chksum_features;
}
my $result = featuresflags2txt($flags, 1);
return $result;
}
if (/^(enabled-(log|net)-compressions|disabled-(log|net)-digests)$/) {
my $op = $1;
my $path = "$mars/defaults/$op";
my $flags = get_link($path, 2);
$flags = 0x0 unless (defined($flags) && $flags ne "");
$flags = hex($flags) if $flags =~ m/^0x/;
#NYI
#$path = "$mars/resource-" . $$env{"res"} . "/defaults/$op";
#my $res_flags = get_link($path, 2);
#if (defined($res_flags) && $res_flags ne "") {
#$res_flags = hex($res_flags) if $res_flags =~ m/^0x/;
#if ($op =~ m/enabled/) {
#$flags |= $res_flags;
#} else {
#$flags = $res_flags;
#}
#}
if ($op =~ m/compressions/) {
$flags &= $compress_features;
} elsif ($op =~ m/digests/) {
$flags &= $chksum_features;
}
my $unusable = $flags & ~$kernel_flags_version;
if ($unusable > 0) {
my $txt = featuresflags2txt($unusable);
lwarn "features '$txt' are not compiled into the kernel module\n";
}
my $result = featuresflags2txt($flags);
return $result;
}
if (/^used-((log|net)-(compression|digest))$/) {
my $op = $1;
my $flags = get_alive_link("used-$op", $$env{"host"}, 2);
$flags = 0x0 unless (defined($flags) && $flags ne "");
$flags = hex($flags) if $flags =~ m/^0x/;
if ($op =~ m/compression/) {
$flags &= $compress_features;
} elsif ($op =~ m/digest/) {
$flags &= $chksum_features;
}
my $result = featuresflags2txt($flags);
return $result;
}
if (/^(tree|features|)[-_]?version$/) {
my $op = $1;
my $result = get_alive_link($op, $$env{"host"}, 2);
if ($result =~ m/^([^,]*,[^,]*,)(0x[0-9a-z]+),(0x[0-9a-z]+)/p) {
my $res1 = $1;
my $res2 = $2;
my $res3 = $3;
my $rest = $POSTMATCH;
$result = $res1 .
featuresflags2txt($res2) .
"," .
featuresflags2txt($res3) .
$rest;
}
return $result;
}
if (/^is[-_]?alive$/) {
my $peer = parse_macro($arg1, $env);
$peer = _get_designated_primary($$env{"res"}) unless $peer;
$peer = $$env{"host"} unless $peer;
my $stamp = get_alive_link("time", $peer, 1);
return is_recent($stamp, $$env{"window"});
}
if (/^alive[-_]?timestamp$/) {
my $peer = parse_macro($arg1, $env);
$peer = _get_designated_primary($$env{"res"}) unless $peer;
return -1 if !$peer || $peer eq "(none)";
my $result = get_alive_link("time", $peer, 2);
return $result if $result;
return get_alive_stamp("alive", $peer);
}
if (/^is[-_]?orphan$/) {
my $peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
return 0 if eval_fn($env, "is-primary", $peer);
my $replay = get_link($$env{"resdir"} . "/replay-$peer", 1);
$replay =~ m/^(log-[^,]+),([0-9]*)/;
my $logfile = $$env{"resdir"} . "/" . $1;
my $logpos = $2;
if (! -r $logfile) {
return 1;
}
my @stat = stat($logfile);
if (!@stat) {
return 1;
}
my $size= $stat[7];
if ($size < $logpos) {
return 1;
}
return 0;
}
if (/^is[-_]?(almost[-_]?)?consistent$/) {
my $almost = $1;
# has sync finished?
my $syncrest = make_numeric(eval_fn($env, "sync-rest", ""));
return 0 if $syncrest > 0;
if (!$almost && eval_fn($env, "is-primary", "")) {
# is the replay link indicating that something is not yet applied / dirty?
my $replay = get_link($$env{"resdir"} . "/replay-" . $$env{"host"}, 1);
$replay =~ m:,[0-9]+,([0-9]+):;
my $rest = $1;
return 0 if $rest > 0;
}
# are all logfiles applied which had accumulated during sync?
my $replay = get_link($$env{"resdir"} . "/replay-" . $$env{"host"}, 1);
return 0 unless $replay;
my $syncpos = get_link($$env{"resdir"} . "/syncpos-" . $$env{"host"}, 2);
if (defined($syncpos) && $syncpos) {
my $cmp = compare_replaylinks($syncpos, $replay);
return 0 if $cmp > 0;
}
return 1;
}
if (/^get[-_]?disk$/) {
my $lnk = $$env{"resdir"} . "/data-" . $$env{"host"};
my $result = get_link($lnk, 1);
$result = "" unless defined($result);
return $result;
}
if (/^get[-_]?device$/) {
my $result = device_name($$env{"res"}, $$env{"host"});
return $result;
}
if (/^disk[-_]?error$/) {
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/disk-error";
my $result = get_link($lnk, 2);
return $result;
}
if (/^disk[-_]?present$/) {
my $lnk = $$env{"resdir"} . "/data-" . $$env{"host"};
my $result = get_link($lnk, 1);
$result = "" unless defined($result);
if ($result) {
# TODO: make this remotely retrievable via status link
$result = -b $result;
$result = "0" unless defined($result);
}
return $result;
}
if (/^device[-_]?present$/) {
my $result = device_exists($$env{"res"}, $$env{"host"});
return $result;
}
# deprecated (irregular names)
if (/^present[-_]?(disk|device)$/) {
my $what = $1;
return eval_fn($env, "$what-present", $arg1);
}
if (/^(device)[-_]?(opened|nrflying|error|completion-stamp|completion-age)$/) {
my $what = $1;
my $op = $2;
my $peer = $$env{"host"};
my %transl =
(
"opened" => "open-count",
"nrflying" => "if-flying",
"error" => "if-state",
"completion-stamp" => "if-completion-stamp",
"completion-age" => "if-completion-stamp",
);
if ($what eq "device") {
my $other = eval_fn($env, "get-device", $arg1);
if ($other =~ m/\@(.*)/) {
$peer = $1;
}
}
my $lnk = $$env{"resdir"} . "/actual-$peer/" . $transl{$op};
my $result = get_link($lnk, 2);
return 0 unless $result;
if ($op =~ m/-age/) {
$result = mars_time() - $result;
$result = 0 if $result < 0;
}
return $result;
}
if (/^is[-_]?split([-_]?brain)?$/) {
my $split = detect_splitbrain($$env{"res"}, 0);
return $split ? 0 : 1;
}
if (/^is[-_]?(attach|sync|fetch|replay|primary|secondary|emergency)$/) {
my $what = $1;
my $is = "is";
$is = "has" if $what eq "emergency";
my $peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
my $lnk = $$env{"resdir"} . "/actual-$peer/$is-$what";
$lnk = correct_path($lnk);
my $val = get_link($lnk, 1);
$val = $val ? 0 : 1 if $what eq "secondary";
return $val;
}
if (/^nr[-_]?(attach|sync|fetch|replay|primary|secondary)$/) {
my $what = $1;
my $is = "is";
$is = "has" if $what eq "emergency";
my $nr = 0;
foreach my $peer (get_member_peers($$env{"res"})) {
my $lnk = $$env{"resdir"} . "/actual-$peer/$is-$what";
$lnk = correct_path($lnk);
my $val = get_link($lnk, 1);
$val = !$val if $what eq "secondary";
$nr++ if $val;
}
return $nr;
}
if (/^does$/) {
my $what = parse_macro($arg1, $env);
my $is = "is";
$is = "has" if $what eq "emergency";
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/$is-$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^(tree|rest-space)$/) {
my $what = $1;
return get_alive_link($what, $$env{"host"}, 1)
}
if (/^systemd[-_]?unit$/) {
return _get_systemd_unit($$env{"cmd"}, $$env{"res"});
}
if (/^(uuid)$/) {
my $what = $1;
my $lnk = "$mars/$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^replay[-_]?code$/) {
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/replay-code";
return get_link($lnk, 2);
}
if (/^(device|sync|fetch|replay|work)[-_]?(rate|ops[-_]?rate|amount[-_]?rate|remain)$/) {
my $what = $1;
my $select = $2;
if ($what eq "work") {
my $val1 = eval_fn($env, "fetch-$select", "");
my $val2 = eval_fn($env, "replay-$select", "");
return "" if (!defined($val1) || $val1 eq "");
return "" if (!defined($val2) || $val2 eq "");
$val1 = make_numeric($val1);
$val2 = make_numeric($val2);
return $val1 + $val2 if $select eq "remain";
# take the maximum rate
return $val1 if $val1 > $val2;
return $val2;
}
my $name = $select;
$name = "amount_rate" if ($name eq "rate" || $name eq "remain");
$name =~ s/-/_/;
my %names =
(
"device" => "if",
"sync" => "sync",
"fetch" => "file",
"replay" => "replay",
);
$name =~ s/_/-$names{$what}_/;
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/$name";
my $rate = get_link($lnk, 2);
# deprecated: compatibility with old version
if (!defined($rate) || $rate eq "") {
$lnk =~ s:/amount-:/:;
$rate = get_link($lnk, 2);
}
if ($select eq "remain") {
my $rest = make_numeric(eval_fn($env, "$what-rest", ""));
return 0 if $rest <= 0;
return -1 if (!defined($rate) || $rate eq "" || $rate <= 0);
return $rest / 1024 / $rate;
}
if ($select =~ /rate/) {
return 0 if (!defined($rate) || !$rate || $rate <= 0);
return $rate if $select =~ /^ops-/;
return $rate * 1024;
}
ldie "unknown macro $_\n";
}
if (/^sync[-_]?size$/) {
my $lnk = $$env{"resdir"} . "/size";
return get_link($lnk, 1);
}
if (/^sync[-_]?pos$/) {
my $lnk = $$env{"resdir"} . "/syncstatus-" . $$env{"host"};
return get_link($lnk, 1);
}
if (/^(replay)[-_]?(lognr|basenr)$/) {
my $op = $2;
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 1);
return $firstpos if $op eq "lognr";
return $logpos;
}
if (/^(fetch)[-_]?(lognr)$/) {
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 1);
return $logpos;
}
if (/^(work)[-_]?(lognr)$/) {
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0);
return $logpos;
}
if (/^replay[-_]?logcount$/) {
return eval_fn($env, "replay-lognr", "") - eval_fn($env, "replay-basenr", "");
}
if (/^(fetch|work)[-_]?logcount$/) {
my $what = $1;
return eval_fn($env, "$what-lognr", "") - eval_fn($env, "replay-lognr", "");
}
if (/^writeback[-_]?rest$/) {
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0);
return $sum;
}
if (/^(replay|work)[-_]?(pos)$/) {
my $what = $1;
my $op = $2;
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0);
return $sum;
}
if (/^(replay[-_]?size)|(fetch[-_]?pos)$/) {
my ($sum0) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0);
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 1);
return $sum0 + $sum;
}
if (/^(fetch|work)[-_]?(size)$/) {
my $what = $1;
my ($sum0) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0);
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0);
return $sum0 + $sum;
}
if (/^(sync|fetch|replay|work)[-_]?(rest|(?:almost[-_]?|threshold[-_]?)?reached|percent|permille|vector)$/) {
my $what = $1;
my $op = $2;
my $size = make_numeric(eval_fn($env, "$what-size", ""));
my $pos = make_numeric(eval_fn($env, "$what-pos", ""));
my $result = 0;
if ($op eq "rest") {
$result = $size - $pos if $pos < $size;
} elsif ($op =~ m/^almost/) {
my $limit = make_numeric(parse_macro($arg1, $env)) if $arg1 ne "";
$limit = 990 if $limit <= 0;
$result = 1 if int($pos / $limit) >= int($size / 1000);
} elsif ($op =~ m/^threshold/) {
my $my_threshold = make_numeric($$env{"threshold"});
$arg1 = parse_macro($arg1, $env);
$my_threshold = make_numeric(get_size($arg1)) if $arg1 ne "";
$result = 1 if $pos + $my_threshold >= $size;
} elsif ($op eq "reached") {
$result = 1 if $pos >= $size;
} elsif ($op eq "percent") {
$result = 100;
$result = 100.5 * $pos / $size if $size > 0;
$result = 100 if $result >= 100.0;
} elsif ($op eq "permille") {
$result = 1000;
$result = 1000.5 * $pos / $size if $size > 0;
$result = 1000 if $result >= 1000.0;
} elsif ($op eq "vector") {
my $delim = parse_macro($arg1, $env);
$delim = ":" unless $delim;
$result = "$pos$delim$size";
} else {
ldie "unknown operation '$op'\n";
}
return $result;
}
if (/^summary[-_]?vector$/) {
my $pos1 = make_numeric(eval_fn($env, "replay-pos", ""));
my $pos2 = make_numeric(eval_fn($env, "fetch-pos", ""));
my $size = make_numeric(eval_fn($env, "fetch-size", ""));
my $delim = parse_macro($arg1, $env);
$delim = ":" unless $delim;
return "$pos1$delim$pos2$delim$size";
}
if (/^deletable[-_]?size$/) {
my ($min, $max) = _get_deletable_logfiles($_, $$env{"res"});
my $sum = 0;
foreach my $path (lamport_glob("$mars/resource-" . $$env{"res"} . "/log-*")) {
$path =~ m/\/log-([0-9]+)-/;
my $nr = $1;
next if $nr < $min or $nr >= $max;
my @stat = stat($path);
$sum += $stat[7];
}
return $sum;
}
if (/^occupied[-_]?size$/) {
my $sum = 0;
foreach my $path (lamport_glob("$mars/resource-" . $$env{"res"} . "/log-*")) {
my @stat = stat($path);
$sum += $stat[7];
}
return $sum;
}
if (/^(disk|resource|device)[-_]?size$/) {
my $what = $1;
my $path = "$mars/resource-" . $$env{"res"};
if ($what eq "device" && eval_fn($env, "device-present", "")) {
return _get_mars_size($$env{"cmd"}, $$env{"res"});
} elsif ($what eq "disk"){
my $peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
$path .= "/actsize-$peer";
} else {
$path .= "/size";
}
return get_link($path, 1);
}
if (/^resource[-_]?possible[-_]?size$/) {
return get_possible_size($$env{"cmd"}, $$env{"res"});
}
# time handling and pausing
if (/^time$/) {
return mars_time();
}
if (/^real[-_]?time$/) {
my $time = time();
return $time;
}
if (/^replay[-_]?timestamp$/) {
$arg1 = parse_macro($arg1, $env);
$arg1 = $$env{"host"} unless $arg1;
my $replay = $$env{"resdir"} . "/replay-$arg1";
return get_link_stamp($replay);
}
if (/^fetch[-_]?timestamp$/) {
$arg1 = parse_macro($arg1, $env);
$arg1 = $$env{"host"} unless $arg1;
my $fetch_glob = $$env{"resdir"} . "/version-*-$arg1";
my @paths = sort alphanum_cmp lamport_glob($fetch_glob);
return -1 if !@paths;
my $fetch = pop(@paths);
return get_link_stamp($fetch);
}
if (/^work[-_]?timestamp$/) {
my $time1 = make_numeric(eval_fn($env, "fetch-timestamp", $arg1));
my $time2 = make_numeric(eval_fn($env, "replay-timestamp", $arg1));
# use the "best" of both...
return $time1 if $time1 > $time2;
return $time2;
}
if (/^(fetch|replay|work|alive)[-_]?age$/) {
my $what = $1;
my $time = make_numeric(eval_fn($env, "$what-timestamp", $arg1));
return -1 if $time <= 0;
return int(mars_time()) - $time;
}
if (/^(fetch|replay|work|alive)[-_]?lag$/) {
my $what = $1;
$arg1 = $$env{"host"} unless $arg1;
my $arg2 = shift;
my $primary = parse_macro($arg2, $env);
if (!$primary) {
$primary = _get_designated_primary($$env{"res"});
}
return -1 if !$primary || $primary eq "(none)";
my $time1 = make_numeric(eval_fn($env, "$what-timestamp", $arg1));
my $time2 = make_numeric(eval_fn($env, "$what-timestamp", $primary));
return -1 if ($time1 <= 0 || $time2 <= 0);
return 0 if $time1 < $time2;
return $time1 - $time2;
}
if (/^sleep$/) {
my $time = parse_macro($arg1, $env);
sleep($time);
return "";
}
if (/^timeout$/) {
my $time = parse_macro($arg1, $env);
sleep_timeout($time);
return "";
}
if (/^wait[-_]?((?:todo|is)[-_](?:attach|sync|fetch|replay|primary|secondary)[-_](?:on|off))$/) {
my $specific = $1;
$specific =~ s/_/-/g;
wait_cond($$env{"cmd"}, $$env{"res"}, $specific);
return "";
}
if (/^wait(?:[-_]?resource)?$/) {
my $specific = parse_macro($arg1, $env);
wait_cond($$env{"cmd"}, $$env{"res"}, $specific);
return "";
}
if (/^wait[-_]?cluster$/) {
my $specific = parse_macro($arg1, $env);
wait_cluster($$env{"cmd"}, $$env{"res"}, $specific);
return "";
}
# generic flow control and loops
if (/^(get|foreach)[-_]?glob$/) {
my $op = $1;
my $paths = parse_macro($arg1, $env);
my $arg2 = shift;
my $varname = parse_macro($arg2, $env);
my $arg3 = shift;
my @list = lamport_glob($paths);
my $result = "";
if ($op eq "get") {
my $delim = parse_macro($arg3, $env);
foreach my $path (@list) {
$result .= $delim if $result;
$result .= $path;
}
} else { # foreach
foreach my $path (@list) {
$$env{$varname} = $path;
$result .= parse_macro($arg3, $env);
}
}
return $result;
}
if (/^(if|unless)$/) {
my $op = $1;
my $cond = parse_macro($arg1, $env);
$cond = !$cond if $op eq "unless";
my $arg2 = shift;
if ($cond) {
ldie "undefined $op-part\n" unless defined($arg2);
return parse_macro($arg2, $env);
} elsif (defined(my $arg3 = shift)) {
return parse_macro($arg3, $env);
}
return "";
}
if (/^else?(if|unless)$/) {
my $op = $1;
unshift @_, $arg1;
while (defined(my $arg1 = shift)) {
if (defined(my $arg2 = shift)) {
my $cond = parse_macro($arg1, $env);
$cond = !$cond if $op eq "unless";
if ($cond) {
return parse_macro($arg2, $env);
}
} else { # odd number of arguments is treated as final "else"
return parse_macro($arg1, $env);
}
}
return "";
}
if (/^while$/) {
my $arg2 = shift;
my $result = "";
while (parse_macro($arg1, $env)) {
$result .= parse_macro($arg2, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^until$/) {
my $arg2 = shift;
my $result = "";
until (parse_macro($arg1, $env)) {
$result .= parse_macro($arg2, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^for$/) {
my ($arg2, $arg3, $arg4) = (shift, shift, shift);
my $result = "";
for (parse_macro($arg1, $env); parse_macro($arg2, $env); parse_macro($arg3, $env)) {
$result .= parse_macro($arg4, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^foreach$/) {
my $varname = parse_macro($arg1, $env);
my $arg2 = shift;
my $txt = parse_macro($arg2, $env);
my $arg3 = shift;
my $delim = parse_macro($arg3, $env);
my $arg4 = shift;
my $result = "";
foreach my $value (split($delim, $txt)) {
$$env{$varname} = $value;
$result .= parse_macro($arg4, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^protect$/) { # don't evaluate argument, take verbatim
return $arg1;
}
if (/^eval$/) { # evaluate given number of times
my $count = parse_macro($arg1, $env);
my $arg2 = shift;
while ($count-- > 0) {
$arg2 = parse_macro($arg2, $env);
}
return $arg2;
}
if (/^eval[-_]?down$/) { # evaluate until result is stable
for (;;) {
my $old = $arg1;
$arg1 = parse_macro($arg1, $env);
last if $arg1 eq $old;
}
return $arg1;
}
if (/^tmp$/) { # evaluate once in a temporary scope
my %copy_env = %$env;
my $result = parse_macro($arg1, \%copy_env);
return $result;
}
if (/^(include|call)$/) {
my $op = $1;
my $name = parse_macro($arg1, $env);
my $txt = get_macro($name);
if ($op eq "call") { # run in new sub-scope
my %copy_env = %$env;
set_args($env, \%copy_env, $name, @_);
return parse_macro($txt, \%copy_env);
}
# 'include' runs in the same scope
set_args($env, $env, $name, @_);
return parse_macro($txt, $env);
}
if (/^callstack$/) {
return $$env{"callstack"};
}
if (/^(abort|return|stop-eval)$/) {
my $op = $1;
$$env{$op} = 1;
return "";
}
if (/^(next|last)$/) {
my $op = $1;
$$env{$op} = 1;
$$env{"__return__"} = 1;
return "";
}
ldie "call to unknown macro '$fn'\n";
}
}
##################################################################
# macro parsing
sub _control_macro {
my $env = shift;
my $control = shift;
my $result = $$env{$control};
$$env{$control} = 0;
return $result;
}
sub parse_macro {
my ($text, $env) = @_;
$text = "" unless defined($text);
my $old_callstack = $$env{"callstack"};
my $result = "";
while ($text =~ m/\\(.)|$match_fn/sp) {
my $escape = $1;
my $fn = $2;
my @args = ($3);
my $pre = $PREMATCH;
my $post = $POSTMATCH;
if (defined($escape)) {
$result .= $pre;
$text = $post;
$_ = $escape;
if (/[tnrfbae]/) {
eval "\$result .= \"\\$escape\"";
next;
}
if (/[a-zA-Z]/) {
lwarn "control sequence '\\$escape' is reserved for future use\n";
}
$result .= $escape;
next;
}
return "" if _control_macro($env, "__abort__");
return $result if _control_macro($env, "__return__");
return $result . $text if _control_macro($env, "__stop_eval__");
$result .= $pre;
$text = $post;
while ($text =~ m/\A\{($match_inner)\}/sp ) {
push @args, $1;
$text = $POSTMATCH;
}
my $new = eval_fn($env, $fn, (@args));
ldie "undefined result from evaluation of primitive macro '$fn'\n" unless defined($new);
$$env{"callstack"} = $old_callstack;
$result .= $new;
}
return "" if _control_macro($env, "__abort__");
return $result if _control_macro($env, "__return__");
return $result . $text if _control_macro($env, "__stop_eval__");
return $result . $text;
}
sub make_env {
my ($cmd, $res, $text) = (shift, shift, shift);
$text =~ s{$match_comment}{}sg;
my $ip = _get_ip($host);
my %start_env =
(
"cmd" => $cmd,
"res" => $res,
"resdir" => "$mars/resource-$res",
"mars" => $mars,
"host" => $host,
"real_host" => $real_host,
"ip" => $ip,
"timeout" => $timeout,
"threshold" => $threshold,
"window" => $window,
"force" => $force,
"dry-run" => $dry_run,
"verbose" => $verbose,
"callstack" => "",
# internal, deliberately not documented
"__abort__" => 0,
"__return__" => 0,
"__stop_eval__" => 0,
"__next__" => 0,
"__last__" => 0,
);
set_args(\%start_env, \%start_env, $cmd, @_);
return ($text, \%start_env);
}
sub eval_macro {
my ($text, $start_env) = make_env(@_);
return parse_macro($text, $start_env);
}
##################################################################
# macro commands
my $macro = "";
my %complex_macros =
(
"default"
=> ""
. "%elsif{%is-member{}}{"
. "%call{device-info}"
. " %{res} [%count-resource-peers{%{res}}/%count-cluster-peers{}]"
. " %include{diskstate} %include{replstate} %include{flags} %include{role} %include{primarynode} %include{commstate}\n"
. "%if{%>{%-{%disk-size{}}{%resource-size{}}}{%{threshold}}}{"
. " Hint: you are wasting %human-numbers{}{ }{ }{%-{%disk-size{}}{%resource-size{}}} on disk %get-disk{}\n"
. "}"
. "%if{%>{%resource-possible-size{}}{%resource-size{}}}{"
. " Hint: marsadm resize would increase resource %{res} by %human-numbers{}{ }{ }{%-{%resource-possible-size{}}{%resource-size{}}}\n"
. "}"
. "%if{%and{%not{%is-primary{}}}{%is-attach{}}}{"
. "%if{%not{%sync-reached{}}}{"
. "%include{syncinfo}"
. "}"
. "%if{%not{%work-reached{}}}{"
. "%include{replinfo}"
. "}"
. "}"
. "%call{resource-errors}"
. "}{"
. "%call{device-info}"
. "}",
"default-resource"
=> "%if{%{res}}{"
. "%{res} %human-numbers{}{ }{ }{%resource-size{}} "
. "[%count-resource-peers{%{res}}/%count-cluster-peers{}]"
. "}",
"default-global"
=> "%call{comminfo}",
"default-header"
=> "%if{%usable-compressions{}}{"
. "used-log-compression=\"%used-log-compression{}\"\n"
. "%if{%{verbose}}{"
. " usable: \"%usable-compressions{}\"\n"
. " enabled: \"%enabled-log-compressions{}\"\n"
. "}"
. "used-net-compression=\"%used-net-compression{}\"\n"
. "%if{%{verbose}}{"
. " usable: \"%usable-compressions{}\"\n"
. " enabled: \"%enabled-net-compressions{}\"\n"
. "}"
. "}"
. "%if{%usable-digests{}}{"
. "used-log-digest=\"%used-log-digest{}\"\n"
. "%if{%{verbose}}{"
. " usable: \"%usable-digests{}\"\n"
. " disabled: \"%disabled-log-digests{}\"\n"
. "}"
. "used-net-digest=\"%used-log-digest{}\"\n"
. "%if{%{verbose}}{"
. " usable: \"%usable-digests{}\"\n"
. " disabled: \"%disabled-net-digests{}\"\n"
. "}"
. "}",
"default-footer"
=> "",
"1and1"
=> "%if{%{res}}{"
. " %{res} %include{diskstate-1and1} %include{replstate-1and1} %include{flags-1and1} %include{role-1and1} %include{primarynode-1and1}\n"
. "%if{%and{%is-attach{}}{%not{%sync-reached{}}}}{"
. "%include{syncinfo-1and1}"
. "}"
. "%if{%and{%is-attach{}}{%not{%work-threshold-reached{}}}}{"
. "%include{replinfo-1and1}"
. "}"
. "%call{resource-errors-1and1}"
. "}{"
. "%the-pretty-global-msg{}"
. "}",
"device-stats"
=> ""
. "%if{%device-opened{}}{"
. "Opened, "
. "%if{%device-error{}}{"
. "ERROR %device-error{} %errno-text{%device-error{}}, "
. "}"
. "%device-ops-rate{} IOPS"
. "%if{%device-nrflying{}}{"
. ", %device-nrflying{} Flying"
. "%if{%>{%device-completion-age{}}{%{window}}}{"
. ", HANGING age: %human-seconds{%device-completion-age{}}"
. "}"
. "}"
. "}{"
. "Closed"
. "}",
"device-info"
=> "%if{%device-present{}}{"
. " LocalDevice %get-device{}"
. ""
. " [%call{device-stats}]\n"
. "}",
"diskstate"
=> "%if{%disk-error{}}{"
. "DISK_ERROR %disk-error{} %errno-text{%disk-error{}} "
. "}"
. "%elsif{%not{%get-disk{}}}{"
. "NotJoined"
. "}{%not{%disk-present{}}}{"
. "NotPresent"
. "}{%not{%todo-attach{}}}{"
. "%if{%is-attach{}}{"
. "Detaching"
. "}{"
. "Detached"
. "}"
. "}{%and{%replay-code{}}{%<{%replay-code{}}{0}}}{"
. "DefectiveLog[%errno-text{%replay-code{}}]"
. "}{%is-orphan{}}{"
. "Orphan"
. "}{%not{%is-attach{}}}{"
. "NoAttach"
. "}{%not{%is-consistent{}}}{"
. "%elsif{%not{%todo-primary{}}}{"
. "InConsistent"
. "}{%is-primary{}}{"
. "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]"
. "}{"
. "Recovery"
. "}"
. "}{%not{%work-reached{}}}{"
. "%elsif{%not{%todo-primary{}}}{"
. "OutDated[%call{outdated-flags}]"
. "}{%is-primary{}}{"
. "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]"
. "}{"
. "Recovery"
. "}"
. "}{%and{%ne{%get-primary{}}{}}{%tmp{%let{host}{%get-primary{}}%is-emergency{}}}}{"
. "EmergencyMode"
. "}{"
. "UpToDate"
. "}",
"diskstate-1and1"
=> "%if{%disk-present{}}{"
. "%if{%does{attach}}{"
. "%if{%is-almost-consistent{}}{"
. "%if{%work-reached{}}{"
. "Uptodate"
. "}{"
. "Outdated[%call{outdated-flags-1and1}]"
. "}"
. "}{Inconsistent}"
. "}{Detached}"
. "}{Detached}",
"outdated-flags"
=> "%if{%fetch-reached{}}{}{F}%if{%replay-reached{}}{}{R}",
"outdated-flags-1and1"
=> "%if{%fetch-reached{}}{}{F}%if{%replay-reached{}}{}{R}",
"replstate"
=> "%elsif{%not{%is-module-loaded{}}}{"
. "ModuleNotLoaded"
. "}{%not{%is-alive{%{host}}}}{"
. "UnResponsive"
. "}{%not{%get-disk{}}}{"
. "NotJoined"
. "}{%not{%todo-attach{}}}{"
. "NotStarted"
. "}{%todo-primary{}}{" # I am designated primary
. "%elsif{%is-emergency{}}{"
. "EmergencyMode"
. "}{%is-primary{}}{"
. "Replicating"
. "}{"
. "NotYetPrimary"
. "}"
. "}{" # Secondary
. "%elsif{%and{%sync-rest{}}{%not{%todo{sync}}}}{"
. "PausedSync"
. "}{%is-sync{}}{"
. "Syncing"
. "}{%not{%todo{fetch}}}{"
. "PausedFetch"
. "}{%not{%todo{replay}}}{"
. "PausedReplay"
. "}{%not{%get-primary{}}}{"
. "NoPrimaryDesignated"
. "}{%not{%is-alive{}}}{"
. "PrimaryUnreachable"
. "}{%is-orphan{}}{"
. "Orphan"
. "}{"
. "Replaying"
. "}"
. "}",
"replstate-1and1"
=> "%if{%disk-present{}}{"
. "%if{%is-primary{}}{"
. "Replicating"
. "}{"
. "%if{%is-alive{}}{"
. "%if{%and{%not{%sync-reached{}}}{%not{%todo{sync}}}}{"
. "PausedSync"
. "}{"
. "%if{%does{sync}}{"
. "Syncing"
. "}{"
. "%unless{%and{%todo-fetch{}}{%todo-replay{}}}{"
. "PausedReplay"
. "}{Replaying}"
. "}"
. "}"
. "}{PrimaryUnreachable}"
. "}"
. "}{NotJoined}",
"flags"
=> "%if{%disk-present{}}{%if{%device-present{}}{D}{d}}{-}"
. "%if{%is-consistent{}}{C}{%if{%disk-present{}}{c}{-}}"
. "%if{%does{attach}}{%if{%todo{attach}}{A}{a}}{%if{%todo{attach}}{a}{-}}"
. "%if{%sync-reached{}}{S}{%if{%todo{sync}}{s}{-}}"
. "%if{%fetch-reached{}}{F}{%if{%todo{fetch}}{f}{-}}"
. "%if{%replay-reached{}}{R}{%if{%todo{replay}}{r}{-}}",
"flags-1and1"
=> "-%if{%todo{sync}}{S}{-}%if{%todo{fetch}}{F}{-}%if{%todo{replay}}{R}{-}-",
"todo-role"
=> "%if{%disk-present{}}{"
. "%if{%todo-primary{}}{"
. "Primary"
. "}{"
. "Secondary"
. "}"
. "}{None}",
"role"
=> "%if{%disk-present{}}{"
. "%if{%todo-primary{}}{"
. "%if{%is-primary{}}{"
. "%if{%>{%nr-primary{}}{1}}{"
. "Forced"
. "}"
. "Primary"
. "}{"
. "NotYetPrimary"
. "}"
. "}{"
. "%if{%is-primary{}}{"
. "RemainsPrimary"
. "}{"
. "Secondary"
. "}"
. "}"
. "}{None}",
"role-1and1"
=> "%if{%disk-present{}}{"
. "%if{%is-primary{}}{"
. "Primary"
. "}{"
. "Secondary"
. "}"
. "}{Secondary}",
"primarynode"
=> "%if{%todo-primary{}}{"
. "%{host}"
. "}{"
. "%get-primary{}"
. "}",
"primarynode-1and1"
=> "%if{%disk-present{}}{"
. "%if{%is-primary{}}{"
. "%{host}"
. "}{"
. "%if{%actual-primary{}}{"
. "%actual-primary{}"
. "}{-}"
. "}"
. "}{-}",
"commstate"
=> "%let{comm}{%alive-age{}}"
. "%if{%>={%{comm}}{%{window}}}{"
. "%human-seconds{%{comm}}"
. "}",
"syncinfo"
=> "%let{amount}{%human-numbers{}{ }{ }{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{ }{ }{%sync-rate{}}}"
. "%sprintf{ syncing: %s %.2f%% (%d/%d)%s rate: %.2f %s/sec remaining: %s hrs\n}"
. "{%progress{20}{%sync-pos{}}{0}{%sync-size{}}}"
. "{%sync-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%{rate}{ }{0}}"
. "{%{rate}{ }{1}}"
. "{%human-seconds{%sync-remain{}}}"
. "%call{sync-line}",
"syncinfo-1and1"
=> "%let{amount}{%human-numbers{}{ }{ }{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{ }{ }{%sync-rate{}}}"
. "%sprintf{ syncing: %s %.2f%% (%d/%d)%s rate: %.2f %s/sec remaining: %s hrs\n}"
. "{%progress{20}{%sync-pos{}}{0}{%sync-size{}}}"
. "{%sync-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%{rate}{ }{0}}"
. "{%{rate}{ }{1}}"
. "{%human-seconds{%sync-remain{}}}"
. "%call{sync-line-1and1}",
"replinfo"
=> "%let{amount}{%human-numbers{}{ }{ }{%replay-pos{}}{%fetch-size{}}}"
. "%let{logs}{%replay-lognr{}}"
. "%let{l2}{%fetch-lognr{}}"
. "%let{l3}{%work-lognr{}}"
. "%if{%>{%{l2}}{%{logs}}}{%append{logs}{::%{l2}}}"
. "%if{%>{%{l3}}{%{l2}}}{%append{logs}{..%{l3}}}"
. "%sprintf{ replaying: %s %.2f%% (%d/%d)%s logs: [%s]\n}"
. "{%progress{20}{%replay-pos{}}{%fetch-pos{}}{%fetch-size{}}}"
. "{%work-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%{logs}}"
. "%call{fetch-line}"
. "%call{replay-line}",
"replinfo-1and1"
=> "%let{amount}{%human-numbers{}{ }{ }{%replay-pos{}}{%fetch-size{}}}"
. "%sprintf{ replaying: %s %.2f%% (%d/%d)%s logs: [%d..%d]\n}"
. "{%progress{20}{%replay-pos{}}{%fetch-pos{}}{%fetch-size{}}}"
. "{%work-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%replay-lognr{}}"
. "{%fetch-lognr{}}"
. "%call{fetch-line-1and1}"
. "%call{replay-line-1and1}",
"sync-line"
=> "%let{amount}{%human-numbers{}{}{}{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{}{}{%sync-rate{}}}"
. "%let{remain}{%human-seconds{%sync-remain{}}}"
. " > sync: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"sync-line-1and1"
=> "%let{amount}{%human-numbers{}{}{}{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{}{}{%sync-rate{}}}"
. "%let{remain}{%human-seconds{%sync-remain{}}}"
. " > sync: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"fetch-line"
=> "%let{amount}{%human-numbers{}{}{}{%fetch-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%fetch-rate{}}}"
. "%let{remain}{%human-seconds{%fetch-remain{}}}"
. "%let{age}{%if{%and{%fetch-remain{}}{%>={%fetch-age{}}{%{window}}}}{ age: %human-seconds{%fetch-age{}}}}"
. "%let{lag}{%if{%and{%fetch-remain{}}{%>={%fetch-lag{}}{%{window}}}}{ lag: %human-seconds{%fetch-lag{}}}}"
. " > fetch: %{amount}%{age}%{lag} rate: %{rate}/s remaining: %{remain}\n",
"fetch-line-1and1"
=> "%let{amount}{%human-numbers{}{}{}{%fetch-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%fetch-rate{}}}"
. "%let{remain}{%human-seconds{%fetch-remain{}}}"
. " > fetch: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"replay-line"
=> "%let{amount}{%human-numbers{}{}{}{%replay-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%replay-rate{}}}"
. "%let{remain}{%human-seconds{%replay-remain{}}}"
. "%let{age}{%if{%and{%replay-remain{}}{%>={%replay-age{}}{%{window}}}}{ age: %human-seconds{%replay-age{}}}}"
. " > replay: %{amount}%{age} rate: %{rate}/s remaining: %{remain}\n",
"replay-line-1and1"
=> "%let{amount}{%human-numbers{}{}{}{%replay-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%replay-rate{}}}"
. "%let{remain}{%human-seconds{%replay-remain{}}}"
. " > replay: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"resource-errors"
=> "%let{fat-count}{%get-resource-fat-count{}}"
. "%if{%{fat-count}}{"
. "FATALS FILE (%{fat-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-fat{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{errs}{%the-err-count{}}"
. "%if{%{errs}}{"
. "ERRORS LNK (%{errs}):\n"
. "%the-pretty-err-msg{}"
. "}"
. "%let{err-count}{%get-resource-err-count{}}"
. "%if{%{err-count}}{"
. "ERRORS FILE (%{err-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-err{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{wrns}{%the-wrn-count{}}"
. "%if{%{wrns}}{"
. "WARNINGS LNK (%{wrns}):\n"
. "%the-pretty-wrn-msg{}"
. "}"
. "%let{wrn-count}{%get-resource-wrn-count{}}"
. "%if{%and{%{verbose}}{%{wrn-count}}}{"
. "WARNINGS FILE (%{wrn-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-wrn{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{infs}{%the-inf-count{}}"
. "%if{%and{%>={%{verbose}}{2}}{%{infs}}}{"
. "INFOS LNK (%{infs}):\n"
. "%the-pretty-inf-msg{}"
. "}"
. "%let{status_msg}{%get-log-status-{}}"
. "%if{%and{%>={%{verbose}}{2}}{%{status_msg}}}{"
. "STATUS FILE:\n%{status_msg}"
. "}",
"resource-errors-1and1"
=> "%let{fat-count}{%get-resource-fat-count{}}"
. "%if{%{fat-count}}{"
. "FATALS FILE (%{fat-count}):\n"
. "%get-resource-fat{}"
. "}"
. "%let{errs}{%the-err-count{}}"
. "%if{%{errs}}{"
. "ERRORS LNK (%{errs}):\n"
. "%the-pretty-err-msg{}"
. "}"
. "%let{err-count}{%get-resource-err-count{}}"
. "%if{%{err-count}}{"
. "ERRORS FILE (%{err-count}):\n"
. "%get-resource-err{}"
. "}"
. "%let{wrns}{%the-wrn-count{}}"
. "%if{%{wrns}}{"
. "WARNINGS LNK (%{wrns}):\n"
. "%the-pretty-wrn-msg{}"
. "}"
. "%let{wrn-count}{%get-resource-wrn-count{}}"
. "%if{%and{%{verbose}}{%{wrn-count}}}{"
. "WARNINGS FILE (%{wrn-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-wrn{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{infs}{%the-inf-count{}}"
. "%if{%and{%{verbose}}{%{infs}}}{"
. "INFOS LNK (%{infs}):\n"
. "%the-pretty-inf-msg{}"
. "}"
. "%let{status_msg}{%get-log-status-{}}"
. "%if{%and{%{verbose}}{%{status_msg}}}{"
. "STATUS FILE:\n%{status_msg}"
. "}",
"comminfo"
=> "%let{txt}{%the-pretty-global-msg{}}"
. "%let{txt}{%subst{%{txt}}{^.*CONNECTED.*\$\n*}{}{mg}}"
. "%if{%{txt}}{"
. "++++++++++++ Global / Communication Status ++++++++++++\n"
. "%{txt}"
. "}"
. "%let{timeslip}{%-{%time{}}{%real-time{}}}"
. "%if{%>{%{timeslip}}{%{window}}}{"
. "++++++++++++ Lamport time slip: %human-seconds{%{timeslip}} ++++++++++++\n"
. "}",
# drbd similar ones
"state"
=> "NYI Please override macro \\%%{0}\\{\\}",
"cstate"
=> "NYI Please override macro \\%%{0}\\{\\}",
"dstate"
=> "NYI Please override macro \\%%{0}\\{\\}",
"status"
=> "NYI Please override macro \\%%{0}\\{\\}",
);
my %view_macros = %complex_macros;
# add some trivial macros to the command line interface
# FIXME: only at most one argument is allowed for now.
my %trivial_globs =
(
# intended for human use
"{all,the}-{pretty-,}{global-,}{{err,wrn,inf}-,}msg"
=> "",
"{is,todo,nr}-{attach,sync,fetch,replay,primary,secondary}"
=> "",
"is-{split-brain,consistent,emergency,orphan}"
=> "",
"is-{member,guest}"
=> "",
"rest-space"
=> "",
"get-{disk,device}"
=> "",
"present-{disk,device}"
=> "(deprecated, use *-present instead)",
"disk-error"
=> "",
"{disk,device}-present"
=> "",
"device-{opened,nrflying,error,completion-{stamp,age}}"
=> "",
"get-log-status"
=> "",
"get-resource-{fat,err,wrn}{,-count}"
=> "",
# intended for scripting
"{my,all}-{resources,members,guests}"
=> "",
"count-{my,all}-{resources,members,guests}"
=> "",
"{cluster,resource}-members"
=> "deprecated",
"count-{cluster,resource,guest}-members"
=> "deprecated",
"{cluster,resource,guest}-peers"
=> "",
"count-{cluster,resource,guest}-peers"
=> "",
"{disk,resource,device}-size"
=> "",
"resource-possible-size"
=> "",
"deletable-size"
=> "",
"occupied-size"
=> "",
"replay-code"
=> "When negative, this indidates that a replay/recovery error has occurred.",
"errno-text"
=> "Convert errno numbers (positive or negative) into human readable text.",
"{sync,fetch,replay,work,syncpos}-{size,pos}"
=> "",
"{sync,fetch,replay,work}-{rest,{almost-,threshold-,}reached,percent,permille,vector}"
=> "",
"writeback-rest"
=> "",
"{sync,fetch,replay}-{ops-rate,amount-rate,rate,remain}"
=> "",
"device-{ops-rate,amount-rate,rate}"
=> "",
"replay-basenr"
=> "",
"{fetch,replay,work}-{lognr,logcount}"
=> "",
"summary-vector",
=> "",
"{time,real-time}"
=> "",
"{alive,fetch,replay,work}-{timestamp,age,lag}"
=> "",
"{potential,implemented,usable}-features"
=> "",
"{implemented,usable}-{digests,compressions}"
=> "",
"enabled-{log|net}-compressions"
=> "",
"disabled-{log|net}-digests"
=> "",
"used-{log,net}-{digest,compression}"
=> "",
"{tree,features}-version"
=> "",
"{get,actual}-primary"
=> "",
"is-{alive}"
=> "",
"systemd-unit"
=> "",
"uuid"
=> "",
"tree"
=> "",
"wait-{is,todo}-{attach,sync,fetch,replay,primary,secondary}-{on,off}"
=> "",
);
my $glob = "";
foreach my $new_glob (sort alphanum_cmp keys(%trivial_globs)) {
$glob .= "," if $glob;
$glob .= $new_glob;
}
foreach my $name (lamport_glob("{$glob}")) {
$view_macros{$name} = "\%primitive-$name\{\%\{1}}";
}
sub _get_pre {
my ($rest, $add) = @_;
$rest =~ s{(\A.*\n)}{}sp;
$add = 0 if $1;
return length($rest) + $add;
}
sub _break_line {
my ($result, $add, $indent) = @_;
my $pre_len = _get_pre($result, $add);
if ($pre_len != $indent) {
$result .= "\\\n" . ' ' x $indent;
}
return $result;
}
sub _pretty_macro {
my ($text, $add, $indent) = @_;
$text =~ s/\\n/\n/gs;
my $result = "";
# look for function calls
while ($text =~ m/^($match_fn_head(?:{})*)/mp) {
$result .= $PREMATCH;
my $fn = $1;
$text = $POSTMATCH;
$result = _break_line($result, $add, $indent);
$add = 0;
$result .= $fn;
while ($text =~ m/\A\{/sp) {
# don't break simple / non-recursive / unbreakable arguments
if ($text =~ m/\A(\{(?:\s$match_inner|[^%{}]*)\})/sp) {
my $shortcut = $1;
$text = $POSTMATCH;
# make newlines non-verbatim
$shortcut =~ s{\n}{\\n}spg;
$result .= $shortcut;
next;
}
# break more complex arguments
#$result .= "{\\\n" . ' ' x $indent;
$result .= "{";
if ($text =~ m/\A\{([\s]$match_inner|)\}/sp) {
$result .= "$1}";
$text = $POSTMATCH;
} elsif ($text =~ m/\A\{([^\s]$match_inner)\}/sp ) {
my $arg = $1;
$text = $POSTMATCH;
my $sub_add = _get_pre($result, 0);
my $subst = _pretty_macro($arg, $sub_add, $indent + 2);
$result .= $subst;
$result = _break_line($result, 0, $indent);
$result .= "}";
} else {
ldie "wtf '$text'?\n";
}
}
}
$result .= $text;
return $result;
}
sub pretty_macro {
my $txt = _pretty_macro(@_);
# always add a trailing newline (for vi ;)
$txt .= "\\\n" unless $txt =~ m{\n\Z}sp;
return $txt;
}
get_global_versions();
if (defined($ARGV[0]) && $ARGV[0] =~ m/^dump-(all-)?macros$/) {
my %macros = %complex_macros;
%macros = %view_macros if defined($1);
foreach my $name (keys %macros) {
my $txt = $macros{$name};
$txt = pretty_macro($txt, 0, 0);
my $file = "$name.tpl";
if (-r $file) { # backup already existing files
for (my $i = 0; ; $i++) {
my $file_old = "$file.old$i";
unless (-r $file_old) {
rename($file, $file_old);
last;
}
}
}
open(OUT, ">", $file) or ldie "cannot create file '$file'\n";
print OUT $txt;
close(OUT);
}
exit(0);
}
sub get_macro {
my ($cmd, $tolerate) = @_;
if ($macro) {
my $orig = $macro;
$macro = ""; # consume once
return $orig;
}
foreach my $path (@MARS_PATH) {
my $file = "$path/$cmd.tpl";
if (-r $file) {
lprint_stderr "using macro file '$file'\n" if (defined($view_macros{$cmd}));
local $/; # slurp
open(IN, "<", $file) or next;
my $tpl = <IN>;
close(IN);
$tpl =~ s{$match_comment}{}sg;
$tpl =~ s{\\n}{\n}sg;
return $tpl;
}
}
return $view_macros{$cmd} if (defined($view_macros{$cmd}));
ldie "cannot find macro '$cmd'\n" unless defined($tolerate);
return "";
}
sub view_cmd {
my ($cmd, $res) = (shift, shift);
if ($cmd =~ s/^prettyprint-//) {
my $txt = get_macro($cmd);
print pretty_macro($txt, 0);
return;
}
$cmd =~ s/^view-?//;
$cmd = "default" unless $cmd;
my $tpl = get_macro($cmd);
my $result = eval_macro($cmd, $res, $tpl, @_);
if ($result ne "") { # add trailing newline if none exists
chomp $result;
$result .= "\n";
}
print $result;
}
##################################################################
# command table of all commands
my %cmd_table =
(
# new keywords
"create-uuid"
=> [
"verbose 3",
"usage: create-uuid (no parameters)",
"Deprecated.",
"This is only needed if you have a very old $mars/",
"directory structure from MARS version light0.1beta05",
"or earlier.",
\&create_uuid,
],
"create-cluster"
=> [
"usage: create-cluster (no parameters)",
"This must be called exactly once when creating a new cluster.",
"Don't call this again! Use join-cluster on the secondary nodes.",
"Please read the PDF manual for details.",
\&create_cluster,
],
"join-cluster"
=> [
"usage: join-cluster <hostname_of_primary>",
"Establishes a new cluster membership.",
"This must be called once on any new cluster member.",
"This is a prerequisite for join-resource.",
\&join_cluster,
],
"merge-cluster"
=> [
"usage: merge-cluster <hostname_of_other_cluster> [<host_ip>]",
"Precondition: the resource names of both clusters must be disjoint.",
"Create the union of two clusters, consisting of the",
"union of all machines, and the union of all resources.",
"The members of each resource are _not_ changed by this.",
"This is useful for creating a big \"virtual LVM cluster\" where",
"resources can be almost arbitrarily migrated between machines via",
"later join-resource / leave-resource operations.",
\&merge_cluster,
],
"merge-cluster-list"
=> [
"usage: merge-cluster-list",
"Determine the local list of resources.",
"Useful for checking or analysis of merge-cluster disjointness by hand.",
\&merge_cluster,
],
"merge-cluster-check"
=> [
"usage: merge-cluster-check <hostname_of_other_cluster>",
"Check whether the resources of both clusters are disjoint.",
"Useful for checking in advance whether merge-cluster would be",
"possible.",
\&merge_cluster,
],
"split-cluster"
=> [
"usage: split-cluster (no parameters)",
"NOT OFFICIALLY SUPPORTED - ONLY FOR EXPERTS.",
"RTFS = Read The Fucking Sourcecode.",
"Use this only if you know what you are doing.",
\&split_cluster,
],
"leave-cluster"
=> [
"usage: leave-cluster (no parameters)",
"This can be used for final deconstruction of a cluster member.",
"Prior to this, all resources must have been left",
"via leave-resource.",
"Notice: this will never destroy the cluster UID on the /mars/",
"filesystem.",
"Please read the PDF manual for details.",
\&leave_cluster,
],
"create-resource"
=> [
"usage: create-resource <resource_name> </dev/lv/mydata>",
"(further syntax variants are described in the PDF manual).",
"Create a new resource out of a pre-existing disk (backing",
"block device) /dev/lv/mydata (or similar).",
"The current node will start in primary role, thus",
"/dev/mars/<resource_name> will appear after a short time, initially",
"showing the same contents as the underlying disk /dev/lv/mydata.",
"It is good practice to name the resource <resource_name> and the",
"disk name identical.",
\&create_res,
],
"join-resource"
=> [
"usage: join-resource <resource_name> </dev/lv/mydata>",
"(further syntax variants are described in the PDF manual).",
"The resource <resource_name> must have been already created on",
"another cluster node, and the network must be healthy.",
"The contents of the local replica disk /dev/lv/mydata will be",
"overwritten by the initial fast full sync from the currently",
"designated primary node.",
"After the initial full sync has finished, the current host will",
"act in secondary role.",
"For details on size constraints etc, refer to the PDF manual.",
\&create_res,
],
"leave-resource"
=> [
"Precondition: the local host must be in secondary role.",
"Stop being a member of the resource, and thus stop all",
"replication activities. The status of the underlying disk",
"will remain in its current state (whatever it is).",
\&leave_res_phase0,
"check preconditions",
\&leave_res_phase1,
"switch state",
\&leave_res_phase2,
"purge logfiles",
\&leave_res_phase3,
"wait for deletions",
],
"delete-resource"
=> [
"CAUTION! This is dangerous when the network is somehow",
"interrupted, or when damaged nodes are later re-surrected",
"in any way.\n",
"Precondition: the resource must no longer have any members",
"(see leave-resource).",
"This is only needed when you _insist_ on re-using a damaged",
"resource for re-creating a new one with exactly the same",
"old <resource_name>.",
"HINT: best practice is to not use this, but just create a _new_",
"resource with a new <resource_name> out of your local disks.",
"Please read the PDF manual on potential consequences.",
\&delete_res,
],
"set-systemd-unit"
=> [
"usage: set-systemd-unit <resource_name> <start_unit_name> [<stop_unit_name>]",
"This activates the systemd template engine of marsadm.",
"Please read mars-user-manual.pdf on this.",
"When <stop_unit_name> is omitted, it will be treated equal to",
"<start_unit_name>.",
\&set_systemd_unit,
],
"get-systemd-unit"
=> [
"usage: get-systemd-unit <resource_name>",
"Show the system units (for start and stop), or empty when unset.",
\&get_systemd_unit,
],
"set-systemd-want"
=> [
"usage: set-systemd-want <resource_name> <host_name>",
"Override the current location where the complete systemd unit stack",
"should be started.",
"Useful for a _temporary_ stop of the systemd unit stack by supplying",
"the special hostname \"(none)\".",
"For a _permanent_ stop, use \"marsadm set-systemd-unit <resource>\"",
"instead.",
\&set_systemd_want,
],
"get-systemd-want"
=> [
"usage: get-systemd-want <resource_name>",
"Show the current hostname where the complete systemd unit stack",
"between start- and stop-unit should appear.",
"Reports empty when unset, or \"(none)\" when stopped.",
\&get_systemd_want,
],
"set-connect-pref-list"
=> [
"verbose 2",
"usage: set-connect-pref-list <resource_name> <host_list>",
"Provisionary command for internal use at 1&1. Will be replaced by",
"a better concept somewhen. The <host_list> must be comma-separated.",
\&set_connect_pref_list,
],
"get-connect-pref-list"
=> [
"verbose 2",
"Provisionary command for internal use at 1&1. Will be replaced by",
"a better concept somewhen.",
"Shows the outcome of set-connect-pref-list.",
\&set_connect_pref_list,
],
"set-global-enabled-log-compressions"
=> [
"usage: set-global-enabled-log-compressions <features>",
"Tell the whole cluster which compression features to use globally",
"for logfile compression. The effective value can be checked via",
"\"marsadm view-enabled-log-compressions\".",
"See \"marsadm view-potential-features\" and",
"\"marsadm --help\" for a list of compression feature names,",
"which must be separated by | symbols.",
\&set_defaults,
],
"set-global-enabled-net-compressions"
=> [
"usage: set-global-enabled-net-compressions <features>",
"Tell the whole cluster which compression features to use globally for",
"network transport compression. This is independent from log compression.",
"The effective value can be checked via",
"\"marsadm view-enabled-log-compressions\".",
"See \"marsadm view-potential-features\" and",
"\"marsadm --help\" for a list of compression feature names,",
"which must be separated by | symbols.",
\&set_defaults,
],
"set-global-disabled-log-digests"
=> [
"usage: set-global-disabled-log-digests <features>",
"Tell the whole cluster which checksumming digests to disable globally",
"for the payload in transaction logfiles.",
"The effective value can be checked via \"marsadm view-disabled-log-digests\".",
"See \"marsadm view-potential-features\" and",
"\"marsadm --help\" for a list of digest feature names,",
"which must be separated by | symbols.",
\&set_defaults,
],
"set-global-disabled-net-digests"
=> [
"usage: set-global-disabled-net-digests <features>",
"Tell the whole cluster which checksumming digests to disable globally",
"for cluster-wide data comparisons, like fast full-sync.",
"The effective value can be checked via \"marsadm view-disabled-net-digests\".",
"See \"marsadm view-potential-features\" and",
"\"marsadm --help\" for a list of digest feature names,",
"which must be separated by | symbols.",
\&set_defaults,
],
"set-disabled-digests"
=> [
"verbose 99", # NYI
"usage: set-disabled-digests <resource_name> <features>",
"Tell the whole cluster which checksumming digests to disable at the",
"resource <resource_name>, potentially overriding the global value",
"as set by set-global-disabled-digests.",
"The effective value can be checked via \"marsadm view-disabled-digests\".",
"See \"marsadm view-potential-features\" and",
"\"marsadm --help\" for a list of digest feature names,",
"which must be separated by | symbols.",
\&set_defaults,
],
"log-rotate"
=> [
"Only useful at the primary side.",
"Start writing transaction logs into a new transaction logfile.",
"This should be regularly called by a cron job or similar.",
"For regular maintainance cron jobs, please prefer 'marsadm cron'.",
"For details and best practices, please refer to the PDF manual.",
\&logrotate_res,
],
"log-delete-one"
=> [
"When possible, globally delete at most one old transaction logfile",
"which is known to be superfluous, i.e. all secondaries no longer",
"need to replay it.",
"Hint: use this only for testing and manual inspection.",
"For regular maintainance cron jobs, please prefer cron",
"or log-delete-all.",
\&logdelete_res,
],
"log-delete"
=> [
"When possible, globally delete all old transaction logfiles which",
"are known to be superflous, i.e. all secondaries no longer need",
"to replay them.",
"This must be regularly called by a cron job or similar, in order",
"to prevent overflow of the /mars/ directory.",
"For regular maintainance cron jobs, please prefer 'marsadm cron'.",
"For details and best practices, please refer to the PDF manual.",
\&logdelete_res,
],
"log-delete-all"
=> [
"Alias for log-delete",
\&logdelete_res,
],
"cron"
=> [
"usage: cron (no parameters)",
"Do all necessary regular housekeeping tasks.",
"This is equivalent to log-rotate all; sleep 7; log-delete-all all.",
[
\&link_purge_global,
],
\&cron_phase1,
"purge links and rotate logfiles",
"SLEEP",
\&cron_phase2,
"delete old logfiles",
],
"log-purge-all"
=> [
"This is potentially dangerous.",
"Use this only if you are really desperate in trying to resolve a",
"split brain. Use this only after reading the PDF manual!",
\&log_purge_res,
],
"err-purge-all"
=> [
"Remove any err message from the given resources.",
\&err_purge_res,
],
"link-purge-all"
=> [
"Remove any .deleted links.",
[
\&link_purge_global,
],
\&link_purge_res,
],
"fake-sync"
=> [
"verbose 1",
"Attention: this is potentially dangerous.",
"Only for experts.",
"Please read the PDF manual to understand the risks!",
\&fake_sync_phase1,
"switch sync off",
\&fake_sync_phase2,
"wait for sync off",
\&fake_sync_phase3,
"fake sync",
],
"set-link"
=> [
"verbose 1",
"usage: set-link <path> <value>",
"Only for experts.",
\&set_link_cmd,
],
"get-link"
=> [
"verbose 1",
"usage: get-link <path>",
"Only for experts.",
\&set_link_cmd,
],
"set-sync-limit-value"
=> [
"usage: set-sync-limit-value <new_value>",
"Set the maximum number of resources which should by syncing",
"concurrently.",
\&set_sync_limit_value,
],
"get-sync-limit-value"
=> [
"usage: get-sync-limit-value (no parameters)",
"For retrieval of the value set by set-sync-limit-value.",
\&set_sync_limit_value,
],
"delete-file"
=> [
"verbose 1",
"usage: delete-file <path>",
"VERY dangerous!",
"Only for experts.",
\&delete_file_cmd,
],
"set-emergency-limit"
=> [
"usage: set-emergency-limit <resource_name> <value>",
"Set a per-resource emergency limit for disk space in /mars.",
"See PDF manual for details.",
\&emergency_limit_res,
],
"get-emergency-limit"
=> [
"Counterpart of set-emergency-limit (per-resource emergency limit)",
\&emergency_limit_res,
],
"emergency-limit" => \&emergency_limit_res,
"cat"
=> [
"usage: cat <path>",
"Print internal debug output in human readable form.",
"Numerical timestamps and numerical error codes are replaced",
"by more readable means.",
"Example: marsadm cat /mars/5.total.status",
\&cat_cmd,
],
"show"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&show_cmd,
],
"show-errors"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&show_errors_cmd,
],
"show-state"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&mars_state_cmd,
],
"mars-state"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&mars_state_cmd,
],
"show-info"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&mars_info_cmd,
],
"mars-info" => \&mars_info_cmd, # deprecated
"pause-replay-local"
=> [
"Stop replaying transaction logfiles for now.",
"This is independent from any {pause,resume}-fetch operations.",
"This may be used for freezing the state of your replica for some",
"time, if you have enough space on /mars/.",
"Only useful on a secondary node.",
\&pause_replay_local_res,
],
"pause-replay-global"
=> [
"Like pause-replay-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_replay_global_res,
],
"pause-replay"
=> [
"See pause-replay-local.",
\&pause_replay_local_res,
],
"resume-replay-local"
=> [
"Restart replaying transaction logfiles, when there is some",
"data left.",
"This is independent from any {pause,resume}-fetch operations.",
"This should be used for unfreezing the state of your local replica.",
"Only useful on a secondary node.",
\&pause_replay_local_res,
],
"resume-replay-global"
=> [
"Like resume-replay-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_replay_global_res,
],
"resume-replay"
=> [
"See resume-replay-local.",
\&pause_replay_local_res,
],
"set-replay"
=> [
"verbose 1",
"VERY dangerous!",
"Only for experts.",
\&set_replay_res,
],
"wait-umount"
=> [
"Wait until /dev/mars/<resource_name> has disappeared in the",
"cluster (even remotely).",
"Useful on both primary and secondary nodes.",
\&wait_umount_res,
],
"wait-cluster"
=> [
"usage: wait-resource [<resource_name>]",
"Waits until a ping-pong communication has succeeded in the",
"whole cluster (or only the members of <resource_name>).",
"NOTICE: this is extremely useful for avoiding races when scripting",
"in a cluster.",
\&wait_cluster,
],
"wait-resource"
=> [
"usage: wait-resource <resource_name>",
" [[attach|fetch|replay|sync][-on|-off]]",
"Wait until the given condition is met on the resource, locally.",
\&wait_cond,
],
"update-cluster"
=> [
"usage: update-cluster [<resource_name>]",
"Fetch all the links from all joined cluster hosts.",
"Use this between create-resource and join-resource.",
"NOTICE: this is extremely useful for avoiding races when scripting",
"in a cluster.",
\&update_cluster,
],
"activate-guest"
=> [
"usage: activate-guest <resource_name>",
"Conditional update-cluster, so that <resource_name> will be locally",
"known at the local machine, and mark the resource as a guest.",
"Useful inbetween create-resource and join-resource.",
"A guest will receive any symlink updates much more frequently.",
"Prefer this over update-cluster when interested in a resource.",
\&activate_guest,
],
"deactivate-guest"
=> [
"usage: deactivate-guest <resource_name>",
"Precondition: the resource must not have local storage assigned.",
"Useful for cleaning up a pure guest relastionship.",
\&activate_guest,
],
# compatible keywords (or their derivatives)
"attach"
=> [
"Attaches the local disk (backing block device) to the resource.",
"The disk must have been previously configured at",
"{create,join}-resource.",
"When designated as a primary, /dev/mars/\$res will also appear.",
"This does not change the state of {fetch,replay}.",
"For a complete local startup of the resource, use 'marsadm up'.",
\&attach_res_phase0,
"check preconditions",
\&attach_res_phase1,
"switch state",
\&attach_res_phase2,
"wait for effect",
],
"detach"
=> [
"Detaches the local disk (backing block device) from the",
"MARS resource.",
"Caution! you may read data from the local disk afterwards,",
"but ensure that no data is written to it!",
"Otherwise, you are likely to produce harmful inconsistencies.",
"When running in primary role, /dev/mars/\$res will also disappear.",
"This does not change the state of {fetch,replay}.",
"For a complete local shutdown of the resource, use 'marsadm down'.",
\&attach_res_phase0,
"check preconditions",
"FORK",
\&attach_res_phase0b,
"wait for systemd device release",
\&attach_res_phase1,
"switch state",
\&attach_res_phase2,
"wait for effect",
],
"resume-fetch-local"
=> [
"Start fetching transaction logfiles from the current",
"designated primary node, if there is one.",
"This is independent from any {pause,resume}-replay operations.",
"Only useful on a secondary node.",
\&fetch_local_res,
],
"resume-fetch-global"
=> [
"Like resume-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"resume-fetch"
=> [
"See resume-fetch-local.",
\&fetch_local_res,
],
"pause-fetch-local"
=> [
"Stop fetching transaction logfiles from the current",
"designated primary.",
"This is independent from any {pause,resume}-replay operations.",
"Only useful on a secondary node.",
\&fetch_local_res,
],
"pause-fetch-global"
=> [
"Like pause-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"pause-fetch"
=> [
"See pause-fetch-local.",
\&fetch_local_res,
],
"connect-local"
=> [
"See resume-fetch-local.",
\&fetch_local_res,
],
"connect-global"
=> [
"Like resume-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"connect"
=> [
"See resume-fetch-local.",
\&fetch_local_res,
],
"disconnect-local"
=> [
"See pause-fetch-local.",
\&fetch_local_res,
],
"disconnect-global"
=> [
"Like pause-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"disconnect"
=> [
"See pause-fetch-local.",
\&fetch_local_res,
],
"syncer" => \&ignore_cmd,
"up"
=> [
"Shortcut for attach + resume-sync + resume-fetch + resume-replay.",
\&up_res_phase0,
"check preconditions",
\&up_res_phase1,
"switch state",
\&up_res_phase2,
"wait for effect",
],
"down"
=> [
"Shortcut for detach + pause-sync + pause-fetch + pause-replay.",
\&up_res_phase0,
"check preconditions",
"FORK",
\&attach_res_phase0b,
"wait for systemd device release",
\&up_res_phase1,
"switch state",
\&up_res_phase2,
"wait for effect",
],
"primary"
=> [
"Promote the resource into primary role.",
"This is necessary for /dev/mars/\$res to appear on the local host.",
"Notice: by concept there can be only _one_ designated primary",
"in a cluster at the same time.",
"The role change is automatically distributed to the other nodes",
"in the cluster, provided that the network is healthy.",
"The old primary node will _automatically_ go",
"into secondary role first. This is different from DRBD!",
"With MARS, you don't need an intermediate 'secondary' command",
"for switching roles.",
"It is usually better to _directly_ switch the primary roles",
"between both hosts.",
"When --force is not given, a planned handover is started:",
"the local host will only become actually primary _after_ the",
"old primary is gone, and all old transaction logs have been",
"fetched and replayed at the new designated priamry.",
"When --force is given, no handover is attempted. A a consequence,",
"a split brain situation is likely to emerge.",
"Thus, use --force only after an ordinary handover attempt has",
"failed, and when you don't care about the split brain.",
"For more details, please refer to the PDF manual.",
[
\&wait_cluster_noforce,
],
\&primary_phase0,
"check preconditions",
"FORK",
\&primary_phase0a,
"conditionally wait for fetch off",
\&primary_phase0b,
"wait for systemd",
"LOOP",
\&primary_phase1,
"leave primary state",
"LOOP",
\&primary_phase1b,
"trigger remote",
"LOOP",
\&primary_phase2,
"wait for cluster when necessary",
"LOOP",
\&primary_phase2b,
"avoid split brain",
\&primary_phase3,
"switch to primary",
\&primary_phase3b,
"trigger remote",
\&primary_phase4,
"wait for device",
],
"secondary"
=> [
"Promote all cluster members into secondary role, globally.",
"In contrast to DRBD, this is not needed as an intermediate step",
"for planned handover between an old and a new primary node.",
"The only reasonable usage is before the last leave-resource of the",
"last cluster member, immediately before leave-cluster is executed",
"for final deconstruction of the cluster.",
"In all other cases, please prefer 'primary' for direct handover",
"between cluster nodes.",
"Notice: 'secondary' sets the global designated primary node",
"to '(none)' which in turn prevents the execution of 'invalidate'",
"or 'join-resource' or 'resize' anywhere in the cluster.",
"Therefore, don't unnecessarily give 'secondary'!",
\&primary_phase0,
"check preconditions",
"FORK",
\&primary_phase1,
"leave primary state",
\&primary_phase1b,
"trigger remote",
\&primary_phase4,
"wait for effect",
],
"invalidate"
=> [
"Only useful on a secondary node.",
"Forces MARS to consider the local replica disk as being",
"inconsistent, and therefore starting a fast full-sync from",
"the currently designated primary node (which must exist;",
"therefore avoid the 'secondary' command).",
"This is usually needed for resolving emergency mode.",
"When having k=2 replicas, this can be also used for",
"quick-and-simple split-brain resolution.",
"In other cases, or when the split-brain is not resolved by",
"this command, please use the 'leave-resource' / 'join-resource'",
"method as described in the PDF manual (in the right order as",
"described there).",
\&invalidate_res_phase0,
"check preconditions",
\&invalidate_res_phase1,
"switch off everything",
\&invalidate_res_phase2,
"wait for everything switched off",
\&invalidate_res_phase3,
"start deletion of old version links",
\&invalidate_res_phase4,
"wait for effect",
\&invalidate_res_phase5,
"start purging old logfiles",
\&invalidate_res_phase6,
"wait for effect",
\&invalidate_res_phase7,
"set new replaylink",
\&invalidate_res_phase8,
"wait for effect",
\&invalidate_res_phase9,
"switch on everything again",
],
"invalidate-remote" => \&forbidden_cmd,
"resize"
=> [
"Prerequisite: all underlying disks (usually /dev/vg/\$res) must",
"have been already increased, e.g. at the LVM layer (cf. lvresize).",
"Causes MARS to re-examine all sizing constraints on all members of",
"the resource, and increase the global logical size of the resource",
"accordingly.",
"Shrinking is currently not yet implemented.",
"When successful, /dev/mars/\$res at the primary will be increased",
"in size. In addition, all secondaries will start an incremental",
"fast full-sync to get the enlarged parts from the primary.",
\&resize_phase0,
"check preconditions",
\&resize_phase1,
"set new size",
\&resize_phase2,
"wait for change",
],
"check-resize" => \&ignore_cmd,
"create-md" => \&senseless_cmd,
"get-gi" => \&ignore_cmd,
"show-gi" => \&ignore_cmd,
"dump-md" => \&senseless_cmd,
"outdate" => \&ignore_cmd,
"adjust" => \&ignore_cmd,
"wait-connect"
=> [
"usage: wait-connect [<resource_name>]",
"See wait-cluster.",
\&wait_cluster,
],
"role"
=> [
"verbose 3",
"Deprecated.",
"Please use the macro command 'view-role' instead.",
"For even better summary information, use plain 'view'.",
\&role_cmd,
],
"state"
=> [
"verbose 3",
"Deprecated.",
"Please use the macro command 'view-role' instead.",
"For even better summary information, use plain 'view'.",
\&role_cmd,
],
"cstate" => \&nyi_cmd,
"dstate" => \&nyi_cmd,
"status" => \&nyi_cmd,
"dump" => \&senseless_cmd,
"verify" => \&forbidden_cmd,
"pause-sync-local"
=> [
"Pause the initial data sync at current stage.",
"This has only an effect if a sync is actually running (i.e.",
"there is something to be actually synced).",
"Don't pause too long, because the local replica will remain",
"inconsistent during the pause.",
"Use this only for limited reduction of system load.",
"Only useful on a secondary node.",
\&pause_sync_local_res,
],
"pause-sync-global"
=> [
"Like pause-sync-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_sync_global_res,
],
"pause-sync"
=> [
"See pause-sync-local.",
\&pause_sync_local_res,
],
"resume-sync-local"
=> [
"Resume any initial / incremental data sync at the stage where it",
"had been interrupted by pause-sync.",
"Only useful on a secondary node.",
\&pause_sync_local_res,
],
"resume-sync-global"
=> [
"Like resume-sync-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_sync_global_res,
],
"resume-sync"
=> [
"See resume-sync-local.",
\&pause_sync_local_res,
],
"new-current-uuid" => \&senseless_cmd,
"hidden-commands" => \&ignore_cmd,
# lowlevel tools
"lowlevel-ls-host-ips"
=> [
"usage: lowlevel-ls-host-ips",
"List cluster member names and IP addresses.",
\&lowlevel_ls_host_ips,
],
"lowlevel-set-host-ip"
=> [
"usage: lowlevel-set-host-ip <hostname> [<new_ip>]",
"Set IP address <new_ip> for host.",
"When <new_ip> is not given, try to determine the old address",
"from the symlink tree, or from old backups.",
"Often, you want to set a new IP address in place of an old one.",
"Hint: you may also use the --ip-<hostname>=<new_ip> option.",
\&lowlevel_set_host_ip,
],
"lowlevel-delete-host"
=> [
"usage: lowlevel-delete-host <hostname>",
"Delete cluster member.",
\&lowlevel_delete_host,
],
# systemd interface
"systemd-trigger"
=> [
\&systemd_trigger,
],
);
sub helplist {
my $msg = shift;
print "ERROR: $msg" if ($msg);
print "
Thorough documentation is in mars-user-manual.pdf. Please use the PDF manual
as authoritative reference! Here is only a short summary of the most
important sub-commands / options:
marsadm [<global_options>] <command> [<resource_names> | all | <args> ]
marsadm [<global_options>] view[-<macroname>] [<resource_names> | all ]
<global_option> =
--force
Skip safety checks.
Use this only when you really know what you are doing!
Warning! This is dangerous! First try --dry-run.
Not combinable with 'all'.
--ignore-sync
Allow primary handover even when some sync is running somewhere.
This is less rude than --force because it checks for all else
preconditions.
--dry-run
Don't modify the symlink tree, but tell what would be done.
Use this before starting potentially harmful actions such as
'delete-resource'.
--verbose
Increase speakyness of some commands.
--parallel
Only resonable when combined with \"all\".
For each resource, fork() a sub-process running independently
from other resources. May seepd up handover a lot.
However, several cluster managers are known to have problems
with a high parallelism degree (up to deadlocks).
Only use this after thorough testing in combination with your
whole operation stack!
Turns off --singlestep.
--parallel=<number>
Like --parallel, but limit the parallelism degree to the given
number of parallel processes.
Turns off --singlestep.
--singlestep
Debugging aid for multi-phase commands.
Interactively step through the various phases of commands.
Turns off --parallel.
--error-injection-phase=<number>
Only for testing. NEVER use in production.
--delete-method=<code>
EXPERIMENTAL! Only for testing! This option will disappear again!
<code> == 0: Use new deletion method
<code> == 1: Use old deletion method
default is 1 for compatibility.
--logger=/path/to/usr/bin/logger
Use an alternative syslog messenger.
When empty, disable syslogging.
--max-deletions=<number>
When your network or your firewall rules are defective over a
longer time, too many deletion links may accumulate at
/mars/todo-global/delete-* and sibling locations.
This limit is preventing overflow of the filesystem as well
as overloading the worker threads.
--thresh-logfiles=<number>
--thresh-logsize=<number>
Prevention of too many small logfiles when secondaries are not
catching up. When more than thresh-logfiles are already present,
the next one is only created when the last one has at least
size thresh-logsize (in units of GB).
--timeout=<seconds>
Current default: $timeout
Abort safety checks and waiting loops after timeout with an error.
When giving 'all' as resource agument, this works for each
resource independently.
The special value -1 means \"infinite\".
--window=<seconds>
Current default: $window
Treat other cluster nodes as healthy when some communcation has
occured during the given time window.
--keep-backups=<hours>
link-purge-all and cron will delete old backup files and old
symlinks after this number of hours.
Current default: $keep_backups
--threshold=<bytes>
Some macros like 'fetch-threshold-reached' use this for determining
their sloppyness.
--host=<hostname>
Act as if the command was running on cluster node <hostname>.
Warning! This is dangerous! First try --dry-run
--backup-dir=</absolute_path>
Only for experts.
Used by several special commands like merge-cluster, split-cluster
etc for creating backups of important data.
--ip-<peer>=<ip>
Override the IP address of <peer> from the symlink tree, or as determined
from old IP backups, or as determined from the list of network interfaces.
Usually you will need this only at 'create-cluster' or
'join-cluster' / 'merge-cluster' / 'split-cluster' for resolving
ambiguities, or for telling the IP address of yet unknown peers.
It is also useful at 'lowlevel-set-host-ip' for updating an
already existing IP address.
Hint: this option may be given multiple times for different <peer>
parts.
--ip=<ip>
Equivalent to --peer-\$host=<ip>
where \$host is usually the same as \$(hostname), but you may
use --host=<hostname> as an _earlier_ argument for overriding
the default <hostname>.
--ssh-port=<port_nr>
Override the default ssh port (22) for ssh and rsync.
Useful for running {join,merge}-cluster on non-standard ssh ports.
--no-ssh
Equivalent to --ssh-port=0
Disable ssh and rsync completely.
Dead peers / interrupted networks / firewalling may lead to (temporary)
hangs of ssh probes, which are used by default for backwards compatibility.
Hint: ssh_config options like ConnectTimeout may also help.
Use this to disable any probes, and no time loss.
--ssh-opts=\"<ssh_commandline_options>\"
Override the default ssh commandline options. Also used for rsync.
--macro=<text>
Handy for testing short macro evaluations at the command line.
<command> =
";
my $item_count = 0;
foreach my $cmdname (sort alphanum_cmp keys(%cmd_table)) {
my $list = $cmd_table{$cmdname};
next unless ref($list) eq "ARRAY";
my @copy = @$list;
my $line_count = 0;
my $verb_level = 0;
while (my $txt = shift @copy) {
last if ref($txt) eq "CODE";
next if ref($txt) eq "ARRAY";
my $skip_this = 0;
if ($txt =~ m/^verbose *([0-9]+)/) {
$verb_level = $1;
$skip_this = 1;
}
if ($verbose < 0) {
last if $verb_level != -$verbose;
} else {
last if $verb_level > $verbose;
}
next if $skip_this;
if (!$line_count) {
print "\n" if $item_count++;
print " $cmdname\n";
print " usage: $cmdname <resource_name>\n" if $txt !~ m/usage:/;
}
print " $txt\n";
$line_count++;
}
}
print "
<resource_names> = comma-separated list of resource names or \"all\" for all resources
";
print "
<macroname> = <complex_macroname> | <primitive_macroname>
<complex_macroname> =
";
foreach my $macroname (sort alphanum_cmp keys(%complex_macros)) {
print " $macroname\n";
}
print "
<primitive_macroname> =
";
foreach my $glob (sort alphanum_cmp keys(%trivial_globs)) {
my $txt = $trivial_globs{$glob};
print " $glob\n";
print " $txt\n" if $txt;
}
print "
<features> =
";
my $nr = 0;
foreach my $flagname (sort alphanum_cmp keys(%FEATURES_FLAGS)) {
print " |\n" if ($nr++);
print " $flagname";
}
print "\n";
exit 0;
}
my @args;
foreach my $arg (@ARGV) {
if ($arg eq "--force" || $arg eq "-f") {
$force++;
next;
} elsif ($arg eq "--ignore-sync") {
$ignore_sync++;
next;
} elsif ($arg eq "--dry-run" || $arg eq "-d") {
$dry_run++;
next;
} elsif ($arg =~ m/--max-deletions\s*=\s*(-?[0-9]+)/) {
$max_deletions = $1;
next;
} elsif ($arg =~ m/--thresh-logfiles\s*=\s*([0-9]+)/) {
$thresh_logfiles = $1;
next;
} elsif ($arg =~ m/--thresh-logsize\s*=\s*([0-9]+)/) {
$thresh_logsize = $1;
next;
} elsif ($arg =~ m/--parallel(\s*=\s*(-?[0-9]+)?)?/) {
$parallel = defined($2) ? $2 : 0;
$single_step = 0;
next;
} elsif ($arg eq "--singlestep") {
$single_step++;
$parallel = -1;
next;
} elsif ($arg =~ m/--error-injection-phase\s*=\s*([0-9]+)/) {
$inject_phase = $1;
next;
} elsif ($arg =~ m/--delete-method\s*=\s*([0-9]+)/) {
$compat_deletions = $1;
my $compat_path = "$mars/compat-deletions";
my $old = get_link($compat_path, 2);
if (!defined($old) || $old ne $compat_deletions) {
if (-d "/mars" && !$dry_run) {
lprint "Setting '$compat_path' to '$compat_deletions'\n";
set_link($compat_deletions, $compat_path);
finish_links();
} else {
lwarn "Cannot set '$compat_path' to '$compat_deletions'\n";
}
}
next;
} elsif ($arg =~ s/--verbose\s*=\s*(-?[0-9]+)/$1/) {
$verbose = $arg;
next;
} elsif ($arg eq "--verbose" || $arg eq "-v") {
$verbose++;
next;
} elsif ($arg =~ s/--logger\s*=\s*(.*)/$1/) {
$logger = $arg;
next;
} elsif ($arg =~ s/--timeout\s*=\s*([0-9]+)/$1/) {
$timeout = $arg;
next;
} elsif ($arg =~ s/--window\s*=\s*([0-9]+)/$1/) {
$window = $arg;
next;
} elsif ($arg =~ s/--keep-backups\s*=\s*([0-9]+)/$1/) {
$keep_backups = $arg;
next;
} elsif ($arg =~ s/--threshold\s*=\s*([0-9]+)/$1/) {
$threshold = get_size($arg);
next;
} elsif ($arg =~ s/--host\s*=\s*([-_A-Za-z0-9]+)/$1/) {
check_id($arg, 1);
if ($arg ne $host) {
lprint "ATTENTION: acting as if I were host '$arg'\n";
lwarn "some commands require local knowledge not available here.\n";
lwarn "thus something may fail or go wrong - use this at your risk!\n";
$host = $arg;
}
next;
} elsif ($arg =~ m/--backup-dir\s*=\s*(\/[^\s\\:;|<>]+)/) {
$backup_dir = $1;
system("mkdir -p $backup_dir") and ldie "Cannot create backup directory '$backup_dir'\n";
next;
} elsif ($arg =~ m/--ip(-(.*?))?\s*=\s*([0-9.:\[\]]+)/) {
my $peer = $2;
my $ip = $3;
$peer = $host unless $peer;
lprint_stderr "Using IP '$ip' from command line for '$peer'.\n";
$known_ips{$peer} = $ip;
next;
} elsif ($arg =~ m/--no-ssh/) {
$ssh_port = 0;
next;
} elsif ($arg =~ s/--ssh[-_]port\s*=\s*([0-9]+)/$1/) {
$ssh_port = $arg;
lprint_stderr "Using SSH port '$ssh_port' from command line.\n";
next;
} elsif ($arg =~ s/--ssh[-_]opts\s*=\s*(.*)/$1/) {
$ssh_opts = $arg;
lprint_stderr "Using SSH options '$ssh_opts' from command line.\n";
next;
} elsif ($arg =~ s/--macro\s*=\s*(.*)/$1/) {
$macro = $arg;
$macro =~ s/\\n/\n/mg;
next;
} elsif ($arg =~ m/^--help$/ || $arg =~ m/^-h$/) {
helplist;
next;
} elsif ($arg =~ m/^--version$/ || $arg =~ m/^-v$/) {
version;
next;
} elsif ($arg =~ m/^-(.*)/) {
ldie "unrecognized argument '-$1' (bad syntax)\n";
}
if ($arg =~ s/^force-//) {
$force++;
}
push @args, $arg;
}
my $cmd = shift @args || helplist "command argument is missing\n";
if ($cmd =~ m/^help$/ || $cmd =~ m/^h$/) {
helplist;
}
if ($cmd =~ m/^version$/ || $cmd =~ m/^v$/) {
version;
}
ldie "only root may use this tool\n" if $< != 0 && $cmd !~ m/^(cat|view.*|pretty.*)$/; # getpid() seems to be missing in perlfunc
helplist "unknown command $cmd\n" if (!exists $cmd_table{$cmd} && !$cmd =~ m/view/);
# setup syslogging
if ($cmd !~ m/^(version$|v$|view)/ && -x $logger) {
$notify = "(cmd: $cmd)";
my $print_id = $Id;
$print_id =~ s/\$|Id:| //g;
$print_id = substr($print_id, 0, 8);
llog "$print_id $host $0 @ARGV\n";
}
# checks
ldie "The $mars directory does not exist.\n" unless -d $mars;
my @mars_stat = stat($mars) or ldie "Cannot stat '$mars'\n";
# check uid
if ($mars_stat[4]) {
lwarn "Directory $mars has wrong owner uid $mars_stat[4].\n";
chown(0, 0, $mars) or ldie "Cannot chown $mars.\n";
lwarn "Fixed by setting to 0.\n";
}
# check permissions
my $perms = ($mars_stat[2] & 07777);
if ($perms != 0700) {
my $txt = sprintf("Directory $mars has wrong permissions 0%o.\n", $perms);
lwarn($txt);
chmod(0700, $mars) or ldie "Cannot chmod $mars.\n";
lwarn "Fixed to mode 0700 for security reasons.\n";
}
# lowlevel tools
if ($cmd =~ m/^lowlevel-/) {
my $func = $cmd_table{$cmd};
if ($func && ref($func) eq "ARRAY") {
lwarn "EXPERTS ONLY -- risky lowlevel command '$cmd'\n";
my @list = @$func;
while (@list) {
my $memb_func = shift @list;
next unless ref($memb_func) eq "CODE";
&{$memb_func}($cmd, @args);
}
finish_links();
exit(0);
} else {
ldie "Internal error: command table is wrong for '$cmd'";
}
}
if ($cmd !~ m/(create|join)-cluster|create-uuid|systemd|cat|view|pretty/) {
my $uuid = get_link("$mars/uuid", 1);
ldie "No valid cluster UUID $mars/uuid found. You need {create,join}-cluster first (or create-uuid).\n" unless $uuid;
}
my $res = "";
if ($cmd =~ "show|cron") {
$res = shift @args || "all";
} elsif ($cmd =~ m/^(view|pretty)/) {
$res = shift @args || "";
} elsif ($cmd =~ m/^set-.*-list$/) {
$res = shift @args || helplist "comma-separated list argument is missing\n";
} elsif ($cmd =~ m/^set-.*-value$/) {
$res = shift @args || helplist "numeric argument is missing\n";
ldie "argument '$res' isn't numeric\n" unless $res =~ m/^[0-9.]+$/;
} elsif ($cmd =~ m/^set-global-/) {
$res = "";
} elsif ($cmd =~ m/^set-/) {
$res = shift @args || helplist "resource argument is missing\n";
check_id_list($res);
} elsif ($cmd =~ m/^(join|merge)-cluster$/) {
$res = shift @args || helplist "peer argument is missing\n";
} elsif (!($cmd =~ m/^(create|split|leave|wait|update)-cluster|merge-cluster-list|create-uuid|cat|[a-z]+-file|trigger/)) {
$res = shift @args || helplist "resource argument is missing\n";
check_id_list($res);
}
lwarn "Using FORCE option -- hopefully you know what you are doing!\n" if $force;
my %checked_res;
sub do_one_res {
my $func = shift;
my ($cmd, $res) = @_;
if ($inject_phase && $phase_nr == $inject_phase) {
ldie "ERROR INJECTION cmd='$cmd' res='$res' phase='$phase_nr'\n";
}
if ($cmd =~ m/^cat|^set-global-|-file$|-list$|-link$|-value$/) { # no resource argument
} elsif (!$checked_res{"$cmd$res"}) {
$res = check_res($res) unless (!$res || $cmd =~ m/^(join|create|merge|leave|wait)-cluster|(create|join)-resource|show/);
check_res_member($cmd, $res) unless (!$res || $cmd =~ m/^(join|create|delete)-(cluster|resource)|^(merge|leave|wait)-cluster|activate-guest|-purge-|^show|^view/);
detect_splitbrain($res, 1);
$checked_res{"$cmd$res"} = 1;
}
call_hook(!$force, "pre", @_);
my $status = &{$func}(@_);
call_hook(!$force, "post", @_);
return $status;
}
sub expand_res_list {
my ($cmd, $res) = @_;
my @res_list=();
if ($res eq "all" && $cmd !~ m/show|cat|cluster|set-link|delete-file/) {
@res_list = get_any_resources($host);
} elsif ($res =~ m/,/) {
@res_list = split(",", $res);
}
return sort alphanum_cmp @res_list;
}
sub do_all_res {
my $func = shift;
my $do_abort = shift;
my $cmd = shift;
my $res = shift;
my @res_list = expand_res_list($cmd, $res);
if (@res_list) {
ldie "Cannot combine command '$cmd' with 'all' existing resources - you must explicitly name a single new resource\n" if $cmd =~ m/create|join/;
my $any_success = 0;
my $any_fail = 0;
my $any_member = 0;
call_hook(!$force, "all-pre", $cmd, "all", @_) if $do_abort;
foreach $res (@res_list) {
$any_member++;
$res =~ s/^.*\/resource-(.*)$/$1/;
next if defined($skip_res{$res});
if ($verbose || $cmd !~ m/^log-/) {
my $tpl = get_macro("default-resource");
my $hint = eval_macro($cmd, $res, $tpl, @_);
my $type = "guest";
$type = "resource" if is_member($res, $host);
lprint "--------- $type $hint\n";
}
if (!$do_abort) {
# LOOP RETRY mode
# Retry when exit code is not 0
# catch internal ldie() via eval{} failure
my $status = 1;
my $has_died = 0;
eval {
$status = do_one_res($func, $cmd, $res, @_);
1;
} or
# eval{} has failed
(
$has_died = 1
);
$any_fail++ if $status;
# Only ldie() implies future skipping of this resource.
if ($has_died && !$skip_res{$res}) {
$any_fail++;
$skip_res{$res} = 1;
fail_action($cmd, $res);
}
next;
}
# NO LOOP RETRY
# Any non-zero exit code means to skip this resource in future.
# catch internal ldie() via eval{} failure
my $has_died = 0;
eval {
do_one_res($func, $cmd, $res, @_);
1;
} and
# eval{} has succeeded
$any_success = 1 or
# eval{} has failed
(
$has_died = 1
);
if ($has_died && !$skip_res{$res}) {
$skip_res{$res} = 1;
fail_action($cmd, $res);
}
}
return $any_fail unless $do_abort;
if (!$any_success) {
if (!$any_member) {
lprint "I am not member/guest of any resource\n";
return 1;
}
ldie "all resources have errors\n";
}
call_hook(!$force, "all-post", $cmd, "all", @_);
return !$any_success;
} elsif ($res eq "all") {
lwarn "resource qualifier 'all' does not match any resource or guest names\n";
return 0;
} elsif (!$do_abort) {
return do_one_res($func, $cmd, $res, @_);
} else {
my $has_died = 0;
my $status = 0;
eval {
$status = do_one_res($func, $cmd, $res, @_);
1;
} or
# eval{} has failed
(
$has_died = 1
);
if ($has_died && !$skip_res{$res}) {
fail_action($cmd, $res);
$status = 1;
}
return $status;
}
}
if ($cmd =~ m/^(view|pretty)/) {
lwarn "mars kernel module is not loaded\n" unless is_module_loaded();
if ($res eq "all" && $cmd =~ m/^view-?(.*)/) {
my $global_macro_name = $1 ? "$1-header" : "default-header";
my $global_macro = get_macro($global_macro_name, 1);
view_cmd($global_macro_name, "", @args) if $global_macro;
}
do_all_res(\&view_cmd, 1, $cmd, $res, @args);
if ($res eq "all" && $cmd =~ m/^view-?(.*)/) {
my $prefix = $1;
foreach my $suffix ("global", "footer") {
my $global_macro_name = $prefix ? "${prefix}-$suffix" : "default-$suffix";
my $global_macro = get_macro($global_macro_name, 1);
view_cmd($global_macro_name, "", @args) if $global_macro;
}
}
finish_links();
# report any dead peers
get_alive_links("all", "alive", "*", 1, 1);
exit($error_count);
}
my $func = $cmd_table{$cmd};
ldie "unknown command '$cmd'\n" unless $func;
my %kid_res;
sub wait_pid_list {
my @pid_list = @_;
foreach my $pid (@pid_list) {
my $check_pid = waitpid($pid, 0);
my $status = $?;
my $sub_res = $kid_res{$pid};
if ($status > 0) {
lwarn "RESOURCE $sub_res CHILD $pid terminated with status=$status\n";
$error_count++ if $status;
} elsif ($check_pid == $pid) {
lprint_stderr "RESOURCE $sub_res CHILD $pid terminated successfully\n";
} else {
lwarn "RESOURCE $sub_res CHILD $pid terminated with unknown state\n";
}
}
}
sub wait_any_pid {
my ($flags) = @_;
lprint_stderr "WAITING for termination of a child...\n";
my $pid = waitpid(-1, $flags);
my $status = $?;
if ($pid > 0) {
my $sub_res = $kid_res{$pid};
if ($status != 0) {
lwarn "RESOURCE $sub_res CHILD $pid terminated with status=$status\n";
$error_count++ if $status;
} else {
lprint_stderr "RESOURCE $sub_res CHILD $pid terminated successfully\n";
}
}
return $pid;
}
sub do_fork {
my @res_list = expand_res_list($cmd, $res);
my $child_count = 0;
# only fork() when beneficial
if (@res_list && scalar(@res_list) > 1) {
lprint_stderr "FORKING (error_count=$error_count)\n";
$error_count = 0;
foreach my $child_res (@res_list) {
# when necessary, limit the parallelism degree
if ($parallel > 0 && $child_count >= $parallel) {
my $done_pid = wait_any_pid(0);
if ($done_pid > 0) {
$child_count--;
delete $kid_res{$done_pid};
}
}
my $pid = fork();
ldie "Cannot fork()\n" unless defined($pid);
if ($pid) {
lprint_stderr "RESOURCE $child_res starting CHILD $pid\n";
$child_count++;
$kid_res{$pid} = $child_res;
} else {
# child: simply continue with new $res
$res = $child_res;
$child_count = 0;
$child_prefix = "CHILD $child_res: ";
if ($skip_res{$res}) {
ldie "SKIPPING\n";
}
lprint_stderr "STARTING\n";
last;
}
}
}
if ($child_count) {
my @wait_list = sort alphanum_cmp keys(%kid_res);
wait_pid_list(@wait_list);
lprint_stderr "EXIT $error_count\n";
exit($error_count);
}
}
if (ref($func) eq "ARRAY") {
my @list = @$func;
while (@list) {
my $do_loop = 0;
my $memb_func = shift @list;
# check whether fork() is possible
if ($memb_func) {
if ($memb_func eq "FORK") {
$memb_func = shift @list;
# check whether fork() is requested
if ($parallel >= 0) {
do_fork();
}
}
# check whether busy-waiting loop is requested
if ($memb_func eq "LOOP") {
$memb_func = shift @list;
$do_loop++;
} elsif ($memb_func eq "SLEEP") {
$memb_func = shift @list;
sleep(7);
}
}
# nested arrays may be used for _global_ workers
if (ref($memb_func) eq "ARRAY") {
my @sub_list = @$memb_func;
foreach my $sub_func (@sub_list) {
next unless ref($sub_func) eq "CODE";
&{$sub_func}($cmd, $res, @args);
}
}
next unless ref($memb_func) eq "CODE";
my $headline = shift @list;
$phase_nr++;
lprint "-------- PHASE $phase_nr -------- $headline:\n" if defined($headline);
$headline = "" unless defined($headline);
if ($single_step) {
ldie "DEBUGGING: standard input is no tty\n" unless -t STDIN;
lprint "DEBUGGING: type RETURN for starting phase $phase_nr: ";
my $dummy = <STDIN>;
lprint "DEBUGGING: continue phase $phase_nr: $headline\n";
}
my $start_time = mars_time();
my $do_abort = !$do_loop;
my $status;
my $count = 0;
for (;;) {
$status = do_all_res($memb_func, $do_abort, $cmd, $res, @args);
last if !$do_loop;
last if (!defined($status) || !$status);
# we have a busy wait condition
$count++;
lprint "--- status='$status' check again LOOP $count\n" if $verbose;
sleep(1);
my $now = mars_time();
if ($now - $start_time > $timeout) {
lwarn "Condition '$headline' for resources '$res' not reached withing $timeout s\n";
last;
}
}
finish_links();
if (defined($status) && $status) {
lprint_stderr "STATUS='$status'\n" if $verbose;
last;
}
}
} elsif (ref($func) eq "CODE") {
do_all_res($func, 1, $cmd, $res, @args);
} else {
ldie "internal error: command table is wrong for '$cmd'";
}
finish_links();
# do not disturb stdout of informational commands
lprint_stderr "EXIT $error_count\n" if $verbose;
exit($error_count);