1
0
mirror of https://github.com/schoebel/mars synced 2025-01-17 20:54:17 +00:00
mars/userspace/marsadm
Thomas Schoebel-Theuer 3ba04911c2 marsadm: bump version
2020-04-11 08:16:51 +02:00

7770 lines
241 KiB
Perl
Executable File

#!/usr/bin/perl -w
#
# MARS Long Distance Replication Software
#
# This file is part of MARS project: http://schoebel.github.io/mars/
#
# Copyright (C) 2010-2014 Thomas Schoebel-Theuer
# Copyright (C) 2011-2014 1&1 Internet AG
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
use strict;
use English;
use warnings;
umask 0077;
##################################################################
# global defaults
my $parallel = -1;
my $compat_deletions = 1;
my $threshold = 10 * 1024 * 1024;
my $window = 60;
my $verbose = 0;
my $max_deletions = 512;
my $thresh_logfiles = 10;
my $thresh_logsize = 5; # GB
my $dry_run = 0;
my @MARS_PATH = $ENV{MARS_PATH} ?
split(/:/, $ENV{MARS_PATH}) :
(
".",
defined($ENV{HOME}) ? "$ENV{HOME}/.marsadm" : "",
"/etc/marsadm",
"/usr/lib/marsadm",
"/usr/local/lib/marsadm",
);
my $marsadm_var_dir = defined($ENV{MARSADM_VRA_DIR}) ?
$ENV{MARSADM_VAR_DIR} :
"/var/marsadm";
##################################################################
# messaging
my $error_count = 0;
my $notify = "";
my $child_prefix = "";
my $logger = "/usr/bin/logger";
sub llog {
my ($text) = @_;
if ($notify) {
$text =~ s/"/\\"/g;
system("$logger -t marsadm \"$notify $text\"");
}
}
sub lprint {
my ($text) = @_;
$OUTPUT_AUTOFLUSH = 1;
print $child_prefix . $text;
llog($text);
}
sub lprint_stderr {
my ($text) = @_;
$OUTPUT_AUTOFLUSH = 1;
print STDERR $child_prefix . $text;
llog($text);
}
sub ldie {
my ($text) = @_;
if ($verbose > 2) {
my $i = 1;
for (;;) {
my ($package, $filename, $line, $subroutine, $hasargs, $wantarray, $evaltext, $is_require, $hints, $bitmask, $hinthash) = caller($i++);
last unless defined($subroutine);
lprint_stderr "$line:$subroutine\n";
}
}
$error_count++;
lprint_stderr "DYING: $text";
llog("DYING: $text");
die "\n";
}
sub lwarn {
my ($text) = @_;
lprint_stderr "WARNING: $text";
llog("WARNING: $text");
}
##################################################################
# basic helpers
sub lamport_glob {
my $expr = shift;
my @result =();
my @raw_glob = glob($expr);
foreach my $path (@raw_glob) {
my $val = readlink($path);
next if (defined($val) && $val eq ".deleted");
push @result, $path;
}
return @result;
}
sub link_exists {
my $path = shift;
return 0 unless -l $path;
my $val = readlink($path);
return 0 unless defined($val);
return 0 if $val eq ".deleted";
return 1;
}
sub file_exists {
my $path = shift;
return 1 if -f $path;
return 0;
}
sub any_exists {
my $path = shift;
return 1 if file_exists($path);
return 1 if link_exists($path);
return 0;
}
##################################################################
# global variables
my $Id = '$Id$ ';
my $user_version = 0.1;
my $marsadm_version = 2.6; # some rough hint at newer features
my $mars = "/mars";
my $host = `uname -n` or ldie "cannot determine my network node name\n";
chomp $host;
check_id($host);
my $real_host = $host;
my $backup_dir = "$mars/backups-" . time();
my $force = 0;
my $ignore_sync = 0;
my $cron_mode = 0;
my $timeout = 600;
my $ip = "";
my $ssh_port = 22;
my $ssh_opts = "-A -o StrictHostKeyChecking=no -o ConnectTimeout=5";
my $ssh_probe = "uname -a";
my $rsync_opts = "-av --exclude='.deleted-*'";
my $kernel_version = 0;
my $kernel_features_version = 0;
##################################################################
# ssh helpers
my %ssh_ips;
my %ssh_ports;
sub make_ssh_cmd {
my ($peer, $no_login) = @_;
my $ssh = "ssh $ssh_opts";
if (!defined($ssh_ports{$peer})) {
my $port;
my $real_peer;
my $peer_ip = get_link("$mars/ips/ip-$peer", 2);
# workaround firewall hell at some installations...
RETRY:
for my $this_peer ($peer, $peer_ip) {
next unless defined($this_peer);
next unless $this_peer;
# check whether machine is reachable
if (system("ping -c1 $this_peer")) {
lwarn "cannot ping '$this_peer'\n";
next;
}
# first try given parameters, then port 22, then ssh_config defaults
my %seen = ();
foreach my $try_port ($ssh_port, 22, 0) {
next if defined($seen{$try_port});
$seen{$try_port} = 1;
if ($try_port && system("if which nc; then nc -v -4 -w 5 \"$this_peer\" $try_port < /dev/null > /dev/null; fi")) {
lwarn "SSH port '$try_port' for '$this_peer' is not reachable\n";
next;
}
my $opt_port = $try_port ? "-p $try_port" : "";
if (!system("$ssh $opt_port root\@$this_peer \"$ssh_probe\"")) {
($real_peer, $port) = ($this_peer, $try_port);
last RETRY;
}
lwarn "SSH to '$this_peer' does not work on port '$try_port'\n";
}
}
ldie "SSH to '$peer' does not work\n" unless defined($port);
$ssh_ips{$peer} = $real_peer;
$ssh_ports{$peer} = $port;
}
my $real_peer = $ssh_ips{$peer};
$real_peer = $peer if !defined($real_peer);
my $port = $ssh_ports{$peer};
$ssh .= " -p $port" if $port > 0;
return ($real_peer, $port, $ssh) if defined($no_login);
return "$ssh root\@$real_peer"
}
sub ssh_cmd {
my ($peer, $cmd, $no_fail) = @_;
if ($peer ne $real_host) {
$cmd =~ s/'/\'/g;
$cmd = make_ssh_cmd($peer) . " '$cmd'";
} else {
$cmd =~ s/'/\'/g;
$cmd = "bash -c '$cmd'";
}
ldie "empty command\n" unless $cmd;
my $status = system($cmd);
if ($status and !defined($no_fail)) {
ldie "SSH to '$peer' command '$cmd' failed\n";
}
return $status;
}
sub rsync_cmd {
my ($peer, $cmd, $no_fail) = @_;
my ($real_peer, $port, $ssh) = make_ssh_cmd($peer, 1);
$cmd =~ s/(?<![-a-zA-Z0-9_])$peer(?![-a-zA-Z0-9_])/$real_peer/g if (defined($real_peer) && $real_peer && $real_peer ne $peer);
my $rsync_cmd = "rsync -e '$ssh' $rsync_opts $cmd";
if ($dry_run) {
lprint "DRY_RUN: would execute the following command:\n$rsync_cmd\n";
$rsync_cmd .= " --dry-run";
}
finish_links();
# Repeat the action because some symlinks might be updated (or even vanish)
# when the peer is actively running. There seems to exist no rsync option
# for ignoring all of these errors.
my $cycle = 0;
my $status = 0;
do {
$status = system($rsync_cmd);
} while ($status != 0 && $status != 24 && $cycle++ < 3);
ldie "Cannot get remote symlink tree via rsync, status=$status\n" if (!defined($no_fail) && $status && $status != 24);
}
##################################################################
# external hook scripts
sub _try_exec {
my ($path, $cmd, $args, $abort) = @_;
return unless -x $path;
lprint "RUNNING HOOK $path $cmd $args\n" if $verbose;
unless ($dry_run) {
my $status = system("$path $cmd $args");
lprint "HOOK status=$status\n" if $verbose;
if ($status) {
lwarn "HOOK $path $cmd $args\nFAILURE $status\n";
ldie if $abort;
}
}
}
sub _try_exec_dir {
my ($subdir, $cmd, $args, $abort) = @_;
return unless -d $subdir;
foreach my $sub_path (lamport_glob("$subdir/*")) {
next if $sub_path =~ m:~$:;
_try_exec($sub_path, $cmd, $args, $abort);
}
}
sub call_hook {
my $abort = shift;
my $type = shift;
my $cmd = shift;
my $args = join(" ", @_);
my $pattern = "hook-$type";
foreach my $dir (@MARS_PATH) {
-d $dir || next;
_try_exec("$dir/$pattern-$cmd", $cmd, $args, $abort);
_try_exec("$dir/$pattern", $cmd, $args, $abort);
_try_exec_dir("$dir/$pattern-$cmd.d", $cmd, $args, $abort);
_try_exec_dir("$dir/$pattern.d", $cmd, $args, $abort);
}
}
##################################################################
# syntactic scanning / parsing
my $match_comment = qr'#[^\n]*|//\h[^\n]*|/\*(?:[^*]|\*[^/])*\*/';
my $match_nobrace = qr'(?:[^{}\\]|\\.)*'s;
my $match_inner = $match_nobrace;
my $match_brace = qr"\{$match_inner\}"s;
for (my $i = 0; $i < 20; $i++) {
$match_inner = qr"$match_nobrace(?:$match_brace$match_nobrace)*"s;
$match_brace = qr"\{$match_inner\}"s;
}
my $match_fn_head = qr"\%([^\\\s{}()\[\]]*)(?=\{)"s;
my $match_fn = qr"$match_fn_head(?:\{($match_inner)\})"s;
##################################################################
# resource lists
my %all_resources;
my %member_resources;
sub _scan_resources {
foreach my $res (lamport_glob("$mars/resource-*/{data,replay,systemd}-*")) {
next unless $res =~ s:^$mars/resource-(.*?)/.*:$1:;
$all_resources{$res} = 1;
if (lamport_glob("$mars/resource-$res/{data,replay,systemd}-$host")) {
$member_resources{$res} = 1;
}
}
lprint "====== found " .
scalar(keys(%all_resources)) . " total and " .
scalar(keys(%member_resources)) . " participating " .
"resources for '$host'\n" if $verbose;
}
sub alphanum_cmp {
my ($aa, $bb) = ($a, $b);
$aa =~ s/([0-9]+)/sprintf("%012d",$1)/eg;
$bb =~ s/([0-9]+)/sprintf("%012d",$1)/eg;
return $aa cmp $bb;
}
#print sort alphanum_cmp ("z", "a3", "a21");
#exit 0;
sub get_all_resources {
_scan_resources() unless %all_resources;
return sort alphanum_cmp keys(%all_resources);
}
sub get_member_resources {
_scan_resources() unless %all_resources;
return sort alphanum_cmp keys(%member_resources);
}
##################################################################
# dynamic systemd control
my $systemd_subdir = defined($ENV{SYSTEMD_SUBDIR}) ? $ENV{SYSTEMD_SUBDIR} : "systemd-templates";
my $systemd_system_dirs = defined($ENV{SYSTEMD_SYSTEM_DIRS}) ?
$ENV{SYSTEMD_SYSTEM_DIRS} :
"/etc/systemd/system,/run/systemd/system,/usr/lib/systemd/system";
my $systemd_target_dir = defined($ENV{SYSTEMD_TARGET_DIR}) ? $ENV{SYSTEMD_TARGET_DIR} : "/run/systemd/system";
my $systemd_var_dir = defined($ENV{SYSTEMD_VAR_DIR}) ?
$ENV{SYSTEMD_VAR_DIR} :
"$marsadm_var_dir/systemd";
my $systemd_suffixes = defined($ENV{SYSTEMD_SUFFIXES}) ?
$ENV{SYSTEMD_SUFFIXES} :
"service,socket,device,mount,automount,swap,target,path,timer,slice,scope";
my $systemctl = defined($ENV{SYSTEMCTL}) ? $ENV{SYSTEMCTL} : "systemctl";
my $systemd_escape = defined($ENV{SYSTEMD_ESCAPE}) ? $ENV{SYSTEMD_ESCAPE} : "@";
my $systemd_incape = defined($ENV{SYSTEMD_INCAPE}) ? $ENV{SYSTEMD_INCAPE} : "\\^";
my $systemd_dependencies = defined($ENV{SYSTEMD_DEPENDENCIES}) ?
$ENV{SYSTEMD_DEPENDENCIES} :
"Requires|Requisite|Wants|BindsTo|PartOf|Conflicts|Before|After|OnFailure|PropagatesReloadTo|ReloadPropagatedFrom|JoinsNamespaceOf|RequiresMountsFor|Alias|WantedBy|RequiredBy|Also|DefaultInstance";
my $systemd_lock_file = defined($ENV{SYSTEMD_LOCK_FILE}) ? $ENV{SYSTEMD_LOCK_FILE} : "/tmp/systemd.lock";
my @systemctl_start =
(
"mars-trigger.path", # This MUST come first
"mars-emergency.path",
);
my @systemctl_enable =
(
@systemctl_start,
"mars-trigger.service",
"mars-emergency.service",
);
my %recursive_locks;
sub systemd_lock {
my ($suffix, $try_lock) = @_;
my $lock_file = $systemd_lock_file;
$lock_file .= "." . $suffix if defined($suffix) && $suffix;
my $lock_status = $recursive_locks{$lock_file}++;
if ($lock_status) {
return 0;
}
lprint "TRYING '$lock_file'\n" if $verbose > 1;
use IO::Handle;
use Fcntl;
my $max_time = $timeout > 0 ? $timeout : 30;
my $count = 0;
my $retry = 0;
my $fh;
for (;;) {
my $test_pid;
if (open(my $IN, "<", $lock_file)) {
$test_pid = <$IN>;
chomp $test_pid;
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($IN);
close($IN);
# Check for timeout
if ($count > $max_time ||
(defined($mtime) && $mtime && $mtime + $max_time < time())) {
lwarn "breaking lock $lock_file after $max_time seconds\n";
unlink($lock_file);
$count = 0;
}
}
$fh = undef;
my $status = sysopen($fh, $lock_file, O_CREAT|O_EXCL|O_TRUNC|O_WRONLY);
last if defined($status) && $status;
# Check whether pid exists
if (defined($test_pid) && $test_pid && ! -d "/proc/$test_pid") {
next if !$retry++;
lwarn "breaking lock $lock_file, pid $test_pid is no longer alive.\n";
unlink($lock_file);
$count = 0;
$retry = 0;
next;
}
if (defined($try_lock) && $try_lock && !$force) {
lprint "FAILED '$lock_file'\n" if $verbose > 1;
return 1;
}
$count++;
sleep(1);
}
print $fh "$$\n";
close($fh);
lprint "LOCK '$lock_file'\n" if $verbose > 1;
return 0;
}
sub systemd_unlock {
my ($suffix) = @_;
my $lock_file = $systemd_lock_file;
$lock_file .= "." . $suffix if defined($suffix) && $suffix;
if (--$recursive_locks{$lock_file} > 0) {
return;
}
unlink($lock_file);
lprint "UNLOCK '$lock_file'\n" if $verbose > 1;
}
my %template_names;
my %template_files;
sub get_template_files {
if (!%template_files) {
foreach my $dir (@MARS_PATH) {
my $subdir = "$dir/$systemd_subdir";
$subdir = $dir unless -d $subdir;
next unless -d $subdir;
foreach my $template_file (lamport_glob("$subdir/*.{$systemd_suffixes}")) {
my $template_name = `basename '$template_file'`;
chomp $template_name;
next unless $template_name;
# Only the first hit will win when the same template is in multiple dirs.
next if defined($template_files{$template_name});
$template_names{$template_file} = $template_name;
$template_files{$template_name} = $template_file;
lprint "== found template '$template_name' at '$template_file'\n" if $verbose;
}
}
}
return sort alphanum_cmp keys(%template_names);
}
sub get_instance_files {
my ($dir) = @_;
my $glob = "$dir/*.{$systemd_suffixes}";
my %instance_files;
foreach my $instance_file (lamport_glob($glob)) {
my $instance_name = `basename '$instance_file'`;
chomp $instance_name;
$instance_files{$instance_name} = $instance_file;
}
return %instance_files;
}
my %systemd_names;
my %systemd_files;
sub get_systemd_files {
if (!%systemd_names) {
foreach my $systemd_file (lamport_glob("{$systemd_system_dirs}/*.{$systemd_suffixes}")) {
next if $systemd_file =~ m:$systemd_target_dir:;
my $systemd_name = `basename '$systemd_file'`;
chomp $systemd_name;
$systemd_names{$systemd_file} = $systemd_name;
$systemd_files{$systemd_name} = $systemd_file;
}
}
return sort alphanum_cmp keys(%systemd_names);
}
sub systemctl {
my ($args) = @_;
my $cmd = "$systemctl $args";
systemd_lock();
lprint "executing: '$cmd'\n" if $verbose > 1;
my $status;
eval {
$status = system($cmd);
};
systemd_unlock();
return $status;
}
my %failed;
sub _systemd_escape {
my ($txt) = @_;
my $replac = `systemd-escape --path "$txt"`;
chomp $replac;
return $replac;
}
sub subst_systemd_vars {
my ($env, $text, $do_extra_escape) = @_;
my $parsed = "";
while ($text =~ m/[$systemd_escape]([A-Za-z_][-A-Za-z0-9_]*)?[{]($match_inner)[}]/ps) {
my $name = $1 || "";
my $body = $2;
$parsed .= $PREMATCH;
my $rest = $POSTMATCH;
my $this_escape = 0;
my $replac;
$_ = $name;
PRE_SWITCH: {
if (/^escvar$/) {
$name = "";
$this_escape = 1;
last PRE_SWITCH;
}
if (/^esc$/) {
$name = "verbatim";
$this_escape = 1;
last PRE_SWITCH;
}
}
$_ = $name;
SWITCH: {
if (/^eval$/) {
$replac = parse_macro($body, $env);
last SWITCH;
}
if (/^$/) {
my $varname = parse_macro($body, $env);
$replac = $$env{$varname};
if (!defined($replac)) {
lwarn "variable '$varname' is undefined\n" unless defined($failed{$varname});
$failed{$varname} = 1;
$replac = "UNDEFINED($varname)";
}
lprint " subst '$systemd_escape\{$varname\}' => '$replac'\n" if $verbose > 2;
last SWITCH;
}
if (/^verbatim$/) {
$replac = $body;
last SWITCH;
}
lwarn "systemd function '$name' is undefined\n";
$replac = $body;
}
if ($do_extra_escape || $this_escape) {
my $orig = $replac;
$replac = _systemd_escape($replac);
lprint " escape '$orig' => '$replac'\n" if $verbose > 2;
}
$parsed .= $replac;
$text = $rest;
}
return ($env, $parsed . $text);
}
sub match_systemd_vars {
my ($env, $pattern, $text) = @_;
($env, $pattern) = subst_systemd_vars($env, $pattern, 1);
($env, $text) = subst_systemd_vars($env, $text, 1);
my @names;
my $regex = "";
while ($pattern =~ m/[$systemd_incape][{]([A-Za-z_][A-Za-z0-9_]*)[}]/ps) {
my $name = $1;
my $pre = $PREMATCH;
my $post = $POSTMATCH;
push @names, $name;
$regex .= $pre . "(.+)";
$pattern = $post;
}
$regex .= $pattern;
$regex =~ s/\\/\\\\/g;
my $nr = 1;
if ($text =~ m/^$regex$/p) {
foreach my $name (@names) {
my $val;
eval "\$val = \$$nr;";
$$env{$name} = $val;
$nr++;
}
return ($env, $text);
}
return (undef, $text);
}
my %referenced_units;
sub _instantiate_systemd_unit {
my ($env, $template_file, $subst) = @_;
($env, my $replac) = subst_systemd_vars($env, $subst, 1);
my $outfile = "$systemd_var_dir.new/$replac";
chomp $outfile;
lprint "==== Translate systemd template '$template_file' => '$outfile'\n" if $verbose;
my $text = "";
{
local $/; # slurp
if (!open(IN, "< $template_file")) {
lwarn "cannot open system template file '$template_file'\n";
return (0, $outfile);
}
$text = <IN>;
close(IN);
}
($env, $text) = subst_systemd_vars($env, $text, 0);
my $scan = $text;
while ($scan =~ m/^\s*($systemd_dependencies)\s*=\s*(.*?)$/mp) {
my $next_unit_list = $2;
$scan = $POSTMATCH;
while ($next_unit_list =~ m/[^\s]+/) {
my $next_unit = $MATCH;
$next_unit_list = $POSTMATCH;
# some units like mount units may be specified as paths.
$next_unit = _systemd_escape($next_unit) if $next_unit =~ m:/:;
lprint "-- '$template_file' found reference to '$next_unit'\n" if $verbose > 2;
# Remember the encountered name
$referenced_units{$next_unit} = 1;
}
}
if (open(IN, "< $outfile")) {
# Check whether something has changed
local $/; # slurp
my $old = <IN>;
close(IN);
if (defined($old) && $old eq $text) {
lprint "== systemd unit '$outfile' has not changed\n" if $verbose;
return (0, $outfile);
}
}
if (!open(OUT, "> $outfile.tmp")) {
lwarn "cannot create '$outfile'\n";
return (0, $outfile);
}
print OUT $text;
close(OUT);
rename("$outfile.tmp", $outfile);
return (1, $outfile, $replac);
}
sub make_systemd_unit {
my ($cmd, $res, $target) = @_;
my @res_list;
if (defined($res)) {
@res_list = ($res);
} else {
@res_list = get_member_resources();
}
my ($found_env, $found_template_file, $found_subst);
search:
foreach my $template_file (get_template_files()) {
my $template_name = $template_names{$template_file};
next unless $template_name;
foreach my $res (@res_list) {
($template_name, my $env) = make_env($cmd, $res, $template_name);
my $subst = $template_name;
(my $new_env, $subst) = match_systemd_vars($env, $template_name, $target);
if ($new_env) {
($found_env, $found_template_file, $found_subst) = ($new_env, $template_file, $subst);
last search;
} elsif ($subst) {
# Check if already installed somewhere else
get_systemd_files();
if (defined($systemd_files{$subst})) {
lprint "systemd unit '$subst' is already present at '$systemd_files{$subst}'\n" if $verbose;
return 0;
}
}
}
}
if (!$found_template_file) {
lwarn "cannot find any systemd template for target unit '$target'\n";
return 0;
}
my ($nr, $file, $name) = _instantiate_systemd_unit($found_env, $found_template_file, $found_subst);
if ($nr) {
$systemd_names{$file} = $name;
$systemd_files{$name} = $file;
}
return $nr;
}
sub set_systemd_want {
my ($cmd, $res, $new) = @_;
if ($new ne "(none)") {
my $host_path = "$mars/ips/ip-$new";
unless (get_link($host_path, 1)) {
lwarn "The hostname '$new' does not exist.\n";
ldie "Refusing to set systemd-want.\n" unless $force;
}
}
my $unit_glob = "$mars/resource-$res/systemd-*-unit";
unless (lamport_glob($unit_glob)) {
lwarn "Resource '$res' has no configured systemd units.\n";
lwarn "First configure the resource with marsadm set-systemd-unit.\n";
ldie "Refusing to set systemd-want.\n" unless $force;
}
my $want_path = "$mars/resource-$res/systemd-want";
set_link($new, $want_path);
finish_links();
_systemd_trigger();
}
sub systemd_present {
my ($cmd, $res) = @_;
my $unit_glob = "$mars/resource-$res/systemd-*-unit";
return "" unless lamport_glob($unit_glob);
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
return $want;
}
sub get_systemd_want {
my $txt = systemd_present(@_);
lprint "$txt\n";
}
sub systemd_exists {
my ($unit_list) = @_;
foreach my $unit (split(/ +/, $unit_list)) {
my $check_cmd = "list-unit-files \"$unit\" | wc -l";
my $count = `$systemctl $check_cmd`;
if (!defined($count) || !$count || $count <= 0) {
lprint "nothing to do for systemd, unit file '$unit' does not exist.\n";
return 0;
}
}
return 1;
}
sub systemd_enabled {
my ($unit_list) = @_;
foreach my $unit (split(/ +/, $unit_list)) {
my $check_cmd = "is-enabled '$unit' > /dev/null 2>&1";
my $status = systemctl($check_cmd);
if ($status) {
lprint "systemd unit '$unit' is not existing or not enabled.\n";
return $status;
}
}
return 0;
}
sub _systemd_op {
my ($op, $unit) = @_;
if (systemctl("cat '$unit' > /dev/null 2>&1")) {
lwarn "systemd unit $unit does not exist.\n";
return 0;
}
my $ctl_cmd = "is-failed --quiet '$unit'";
my $ok = systemctl($ctl_cmd);
if (!$ok) {
my $ctl_cmd = "reset-failed '$unit'";
my $status = systemctl($ctl_cmd);
lprint "--- resetting failed unit '$unit': status=$status\n";
}
if ($op eq "start" || $op eq "restart") {
if (systemd_enabled($unit)) {
return 0;
}
}
$ctl_cmd = "$op '$unit'";
lprint "--- running systemd command: $ctl_cmd\n";
my $status = systemctl($ctl_cmd);
if ($status) {
lwarn "command '$systemctl $ctl_cmd' failed, status=$status\n";
} else {
lprint "--- systemctl status=$status\n";
}
return $status;
}
sub systemd_activate {
my ($cmd, $res, $override, $fail_abort) = @_;
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
my $want_host_path = "$mars/resource-$res/userspace/systemd-want-$host";
if (!$want) {
lprint "Nothing to (de)activate: $want_path does not exist\n" if $verbose;
unlink($want_host_path);
return 0;
}
my $do_activate = $want eq $host;
if ($do_activate) {
# Check attach switch
my $path = "$mars/resource-$res/todo-$host/attach";
if (!get_link($path, 1)) {
$do_activate = 0;
}
}
if ($do_activate) {
my $primary = _get_designated_primary($res);
if ($primary ne $host) {
# Do not activate for now
lprint "Overriding systemd-want: '$host' is not designated primary,\n";
lprint "==== Do not modify resource '$res'\n" if $verbose;
return 0;
}
}
if (defined($override) && $override != $do_activate) {
lprint "Overriding unit activate=$do_activate with $override\n" if $verbose;
$do_activate = $override;
}
my $dev = "/dev/mars/$res";
if ($do_activate && ! -b $dev) {
lprint "Device $dev not present, cannot activate systemd unit\n" if $verbose;
$do_activate = 0;
}
my $oper = $do_activate ? "start" : "stop";
my $unit_path = "$mars/resource-$res/systemd-$oper-unit";
my $unit = get_link($unit_path, 2);
if (!$unit) {
lprint "Nothing to (de)activate: $unit_path does not exist\n" if $verbose;
unlink($want_host_path);
return 0;
}
my $ctl_cmd = "show \"$unit\"";
my $op = "show";
if ($do_activate) {
system("touch $want_host_path");
$unit =~ s/ .*//;
if (systemd_enabled($unit)) {
return 0;
}
lprint "==== Activate resource '$res' unit '$unit'\n"if $verbose;
$op = "start";
} else {
unlink($want_host_path);
$unit =~ s/.* //;
lprint "==== Deactivate resource '$res' unit '$unit'\n"if $verbose;
$op = "stop";
}
my $status = _systemd_op($op, $unit);
my $response_path = "$mars/resource-$res/userspace/systemd-status-$oper-$host";
set_link($status, $response_path);
finish_links();
if ($status && defined($fail_abort) && $fail_abort) {
lwarn "systemd unit '$unit' operation $op failed, status=$status\n";
return 0;
}
return 1;
}
sub __systemd_commit {
my %changes;
my %act_files = get_instance_files($systemd_target_dir);
my %old_files = get_instance_files($systemd_var_dir);
my %new_files = get_instance_files("$systemd_var_dir.new");
foreach my $old_target (sort alphanum_cmp keys(%old_files)) {
next if defined($new_files{$old_target});
next if !defined($act_files{$old_target});
lprint "-- marking '$old_target' for removal\n" if $verbose > 2;
$changes{$old_target} = -1;
}
system("rm -rf \"$systemd_var_dir.old\"");
system("mv \"$systemd_var_dir\" \"$systemd_var_dir.old\"");
system("mv \"$systemd_var_dir.new\" \"$systemd_var_dir\"");
if (system("cp -a $systemd_var_dir/* \"$systemd_target_dir\"")) {
lwarn "Cannot copy new unit instances from '$systemd_var_dir' to '$systemd_target_dir'\n";
return ();
}
foreach my $new_target (sort alphanum_cmp keys(%new_files)) {
next if defined($old_files{$new_target});
lprint "-- enabling new '$new_target'\n" if $verbose > 2;
my $unit = `basename "$new_target"`;
chomp $unit;
_systemd_op("enable", $unit);
}
return %changes;
}
sub __systemd_generate_all {
my ($cmd) = @_;
return unless -d $mars;
return unless -d $systemd_target_dir;
system("rm -rf \"$systemd_var_dir.new\"");
system("mkdir -p \"$systemd_var_dir.new\"");
system("mkdir -p \"$systemd_var_dir\"");
return unless -d $systemd_var_dir;
return unless -d "$systemd_var_dir.new";
# Determine all template files.
get_template_files();
# Always add all plain templates
%referenced_units = ();
my $count = 0;
foreach my $template_name (sort alphanum_cmp keys(%template_files)) {
next if $template_name =~ m/($systemd_incape|$systemd_escape)/;
$count += make_systemd_unit($cmd, "UNDEFINED_RESOURCE", $template_name);
}
# Determine all participating resource names.
my @res_list = get_member_resources();
# Create initial systemd units
foreach my $res (@res_list) {
foreach my $unit_link (lamport_glob("$mars/resource-$res/systemd-*-unit")) {
my $target = get_link($unit_link);
$count += make_systemd_unit($cmd, $res, $target);
}
}
# Compute the transitive closure of referenced units
lprint "== adding transitive units for $count start units.\n" if $verbose;
my %done_units;
for (;;) {
my $old_count = $count;
foreach my $target (sort alphanum_cmp keys(%referenced_units)) {
next if $done_units{$target};
$count += make_systemd_unit($cmd, undef, $target);
$done_units{$target} = 1;
}
last if ($count <= $old_count);
}
lprint "== $count units have changed.\n" if $verbose;
# Check and commit the new situation
my %changes = __systemd_commit();
return %changes;
}
sub __systemd_commit_ops {
my $cmd = shift;
my %changes = @_;
my $deleted = 0;
foreach my $target (sort alphanum_cmp keys(%changes)) {
my $action = $changes{$target};
if ($action < 0) {
lprint "Removing old template instance '$target'\n" if $verbose;
_systemd_op("stop", $target);
system("rm -f \"$systemd_target_dir/$target\"");
$deleted++;
}
}
lprint "== $deleted units have been removed.\n" if $verbose;
lprint "==== Restart systemd\n"if $verbose;
foreach my $unit (@systemctl_enable) {
_systemd_op("enable", $unit);
}
systemctl("daemon-reload");
# Activate all *.path triggers
for my $unit_path (lamport_glob("$systemd_target_dir/*mars*.path")) {
my $unit = `basename "$unit_path"`;
chomp $unit;
lprint "==== Activate path watcher '$unit'\n"if $verbose;
_systemd_op("start", $unit);
}
my $varfile = "$marsadm_var_dir/systemd.status";
system("mv $varfile.tmp $varfile");
}
sub __systemd_activate_ops {
my $cmd = shift;
# Activate the listed units.
my @res_list = get_member_resources();
foreach my $res (@res_list) {
systemd_activate($cmd, $res);
}
# Start standard units
foreach my $unit (@systemctl_start) {
_systemd_op("start", $unit);
}
}
sub __systemd_fingerprint {
my () = @_;
my $text = "";
# Fingerprint all source templates
get_template_files();
foreach my $template_file (sort alphanum_cmp keys(%template_names)) {
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($template_file);
$text .= "$size,$mtime,$template_file\n";
}
# Fingerprint all resources
$text .= "#\n";
my @res_list = get_member_resources();
foreach my $res (@res_list) {
$text .= "$res\n";
my $unit_glob = "$mars/resource-$res/systemd-*-unit";
foreach my $unit_link (lamport_glob($unit_glob)) {
$text .= get_link($unit_link, 1) . "\n";
}
}
# Fingerprint resulting templates (protect against external modifications)
$text .= "#\n";
foreach my $unit_file (lamport_glob($systemd_target_dir)) {
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($unit_file);
$text .= "$size,$mtime,$unit_file\n";
}
return $text;
}
sub is_systemd_generate_necessary {
my $cmd = shift;
my $text = __systemd_fingerprint();
system("mkdir -p $marsadm_var_dir");
my $varfile = "$marsadm_var_dir/systemd.status";
my $old_text = "";
local $/; # slurp
if (open(IN, "< $varfile")) {
$old_text = <IN>;
close(IN);
} else {
lwarn "file '$varfile' does not yet exist\n";
}
if ($text eq $old_text) {
lprint "systemd template generation not necessary.\n" if $verbose;
return 0 unless $force;
lprint "Forcing template generation...\n";
}
if (!open(OUT, "> $varfile.tmp")) {
lwarn "cannot open '$varfile.tmp'\n";
return 1;
}
print OUT $text;
close(OUT);
return 1;
}
sub __systemd_trigger {
my $cmd = shift;
# ensure that trigger file exists
my $trigger = "$mars/userspace/systemd-trigger";
lprint "Triggering '$trigger' for '$cmd'\n" if $verbose;
system("touch $trigger");
}
sub _systemd_trigger {
my ($cmd) = @_;
my $needed_unit = $systemctl_start[0];
if (!systemd_exists($needed_unit)) {
return;
}
if (!systemctl("cat '$needed_unit' > /dev/null 2>&1")) {
if (systemctl("status '$needed_unit' > /dev/null 2>&1")) {
systemctl("enable '$needed_unit'");
systemctl("start '$needed_unit'");
}
}
if (systemd_enabled($needed_unit)) {
return;
}
systemd_lock();
if (is_systemd_generate_necessary($cmd)) {
__systemd_activate_ops($cmd);
lprint "Direct template generation\n" if $verbose;
my %changes;
# Continue with unlock in case of any deaths inbetween
eval {
%changes = __systemd_generate_all($cmd);
};
__systemd_commit_ops($cmd, %changes);
}
__systemd_activate_ops($cmd);
systemd_unlock();
}
sub systemd_trigger {
my ($cmd) = @_;
return unless -d $systemd_target_dir;
systemd_lock();
if (is_systemd_generate_necessary($cmd)) {
my %changes;
# Continue with unlock in case of any deaths inbetween
eval {
%changes = __systemd_generate_all($cmd);
};
__systemd_commit_ops($cmd, %changes);
}
__systemd_activate_ops($cmd);
systemd_unlock();
}
sub _get_systemd_unit {
my ($cmd, $res) = @_;
my $unit_start_path = "$mars/resource-$res/systemd-start-unit";
my $unit_stop_path = "$mars/resource-$res/systemd-stop-unit";
my $start_unit = get_link($unit_start_path, 2);
my $stop_unit = get_link($unit_stop_path, 2);
if (!$start_unit) {
lprint_stderr "Nothing to show: start unit $unit_start_path does not exist\n" if $verbose;
return "";
}
if (!$stop_unit) {
lwarn "Strange: stop unit $unit_stop_path does not exist\n";
return $start_unit;
}
return "$start_unit $stop_unit";
}
sub get_systemd_unit {
my $unit = _get_systemd_unit(@_);
lprint "$unit\n";
}
sub set_systemd_unit {
my ($cmd, $res, $start_name, $stop_name) = @_;
ldie "Start unit name is undefined\n" unless defined($start_name);
$stop_name = $start_name unless defined($stop_name);
my $unit_start_path = "$mars/resource-$res/systemd-start-unit";
my $unit_stop_path = "$mars/resource-$res/systemd-stop-unit";
# Convenience: try to fix any non-escaped resource names
$start_name =~ m/($res)/p;
if ($1) {
my $pre = $PREMATCH;
my $post = $POSTMATCH;
my $replac = _systemd_escape($res);
$start_name = $pre . $replac . $post;
}
$stop_name =~ m/($res)/p;
if ($1) {
my $pre = $PREMATCH;
my $post = $POSTMATCH;
my $replac = _systemd_escape($res);
$stop_name = $pre . $replac . $post;
}
my $want_path = "$mars/resource-$res/systemd-want";
if ($start_name) {
set_link($start_name, $unit_start_path);
my $primary = _get_designated_primary($res, 1);
if ($primary) {
set_link($primary, $want_path);
}
} else {
_create_delete($unit_start_path);
_create_delete($want_path);
}
if ($stop_name) {
set_link($stop_name, $unit_stop_path);
} else {
_create_delete($unit_stop_path);
_create_delete($want_path);
}
finish_links();
systemd_trigger($cmd);
}
##################################################################
# path correction
sub correct_path {
my ($path) = @_;
# actual switches
$path =~ s:(/is-attach)[a-z]*$:$1ed:;
$path =~ s:(/is-)(fetch)[a-z]*$:$1copy:;
$path =~ s:(/is-)(apply)[a-z]*$:$1replay:;
$path =~ s:(/is-(copy|replay|sync))[a-z]*$:$1ing:;
# todo switches
$path =~ s:(/fetch)[a-z]*$:/connect:;
$path =~ s:(/apply)[a-z]*$:/allow-replay:;
$path =~ s:(/replay)[a-z]*$:/allow-replay:;
return $path;
}
##################################################################
# low-level infrastructure
my @link_list = ();
my %link_hash;
sub get_link {
my ($path, $unchecked) = @_;
my $result = readlink($path);
if (!defined($result)) {
ldie "cannot read symlink '$path'\n" unless $unchecked;
lwarn "cannot read symlink '$path'\n" if $unchecked == 1;
$result = "";
}
$result = "" if $result eq ".deleted";
return $result;
}
sub get_link_stamp {
my ($path) = @_;
my @stat = lstat($path);
return 0 if (!@stat);
return $stat[9];
}
sub is_link_recent {
my ($path, $wind) = @_;
$wind = $window * 2 unless defined($wind);
my @stat = lstat($path);
return 0 if (!@stat);
return 1 if $stat[9] + $wind >= mars_time();
return 0;
}
sub to_tmp {
my $path = shift;
$path =~ s:^(.*)/:$1/.tmp.$$.:;
return $path;
}
sub set_link {
my ($src, $dst) = @_;
# safeguard trailing slashes
$dst =~ s:/$::;
my $dst_tmp = to_tmp($dst);
unlink($dst_tmp);
symlink($src, $dst_tmp) or ldie "cannot create symlink '$dst' -> '$src'\n";
# the _order_ is important! remove existing intermediate element before re-appanding
if (exists($link_hash{$dst})) {
my @copy = @link_list;
@link_list = ();
foreach my $elem (@copy) {
next if $elem eq $dst;
push @link_list, $elem;
}
}
$link_hash{$dst} = $src;
push @link_list, $dst;
}
sub finish_links {
return unless @link_list;
my $timestamp = mars_time();
lprint "using lamport timestamp $timestamp\n" if $verbose;
my $trigger_code = 1;
my $count = 0;
while (my $link = shift @link_list) {
my $link_tmp = to_tmp($link);
my $target = readlink($link_tmp);
next unless defined($target);
my $this_timestamp = $timestamp;
# allow overriding of secondaries in partitioned clusters by use of small timestamps
if ($target eq "(none)") {
my @stat = lstat($link);
$this_timestamp = $stat[9] + 1 if @stat;
}
unless (system("touch -h -d \"\@$this_timestamp\" $link_tmp") == 0) {
lwarn "cannot set mtime on symlink '$link_tmp'\n";
}
if ($dry_run) {
lprint "DRY_RUN: would create symlink '$link' -> '$target'\n";
unlink($link_tmp);
next;
}
# allow replacement of directories with symlinks
rmdir($link) if -d $link;
unless (rename($link_tmp, $link)) {
lwarn "cannot finalize symlink '$link'\n";
} elsif ($verbose) {
lprint "created symlink '$link' -> '$target'\n";
}
$count++;
$trigger_code = 2 if $link =~ m:/(primary|todo-global|ip):;
}
_trigger($trigger_code) if $count > 0;
}
##################################################################
# global checks
sub get_global_versions {
unless (defined($ARGV[0]) && $ARGV[0] =~ m/cluster|cat/) {
$kernel_version = get_link("$mars/tree-$host", 1);
if ($kernel_version && $user_version != $kernel_version) {
lwarn "kernel_version=$kernel_version user_version=$user_version\n";
if ($user_version < $kernel_version) {
ldie "Sorry, your MARS kernel module uses version $kernel_version, but my $0 userspace version is only $user_version. That cannot work. Please upgrade your userspace scripts!\n";
} elsif (int($user_version) > int($kernel_version) + 1) {
ldie "MAJOR VERSION mismatch : only one major upgrade step is supported.\n";
}
lwarn "using different minor versions is possible, but you should upgrade your kernel module ASAP\n";
}
}
# compute the mimimum of kernel features capabilities
foreach my $link (lamport_glob("$mars/features-*")) {
my $features = get_link($link);
next unless (defined($features) && $features);
$features =~ m/^([0-9]+)/;
$features = $1;
next unless $features > 0;
if (!$kernel_features_version || $features < $kernel_features_version) {
$kernel_features_version = $features;
}
}
my $compat_path = "$mars/compat-deletions";
$compat_deletions = get_link($compat_path, 2);
if ($kernel_features_version < 3 || !defined($compat_deletions) || $compat_deletions eq "") {
$compat_deletions = 1;
}
}
sub get_alive_links {
my $res = shift || "all";
my $alive = shift || "alive";
my $hosts = shift || "*";
my $warn = shift || 0;
my $non_participating = shift || 0;
$res = "*" if $res eq "all";
my %cand;
foreach my $path (lamport_glob("$mars/ips/ip-$hosts")) {
$path =~ m:/ip-(.*):;
my $peer = $1;
$cand{$peer} = 1;
}
my %peers;
foreach my $path (lamport_glob("$mars/resource-$res/data-*")) {
$path =~ m:/data-(.*):;
my $peer = $1;
next unless $peer;
next if $peer eq $host;
# avoid O(n^2) globbing
next if $peers{$peer};
# peer must be a candiate matching the hosts spec
next unless $cand{$peer};
# peer must be participating in the same resources
if (!$non_participating) {
my @other = lamport_glob("$mars/resource-$res/data-$peer");
next unless @other;
# I must be participating in some of the _same_ resources
my $common = 0;
foreach my $check (@other) {
my $self = `dirname $check`;
chomp $self;
$self .= "/data-$host";
$common++ if any_exists($self);
}
next unless $common;
}
# OK: remember peer
$peers{$peer} = 1;
}
return %peers unless %peers;
my $glob;
if (scalar(keys(%peers)) == 1) {
$glob = "$mars/$alive-" . join(',', keys(%peers));
} else {
$glob = "$mars/$alive-{" . join(',', keys(%peers)) . "}";
}
my %links;
foreach my $path (lamport_glob($glob)) {
$path =~ m:/$alive-(.+):;
my $peer = $1;
$links{$peer} = get_link($path, 1);
}
if ($warn) {
foreach my $peer (keys(%peers)) {
my $path = "$mars/$alive-$peer";
if (!is_link_recent($path)) {
my $stamp = get_link_stamp($path);
my $age = seconds2human(mars_time() - $stamp);
my $msg = "no metadata is arriving from peer '$peer', age = $age";
$msg .= " => check your network setup" if is_module_loaded();
lwarn "$msg\n";
}
}
}
return %links;
}
##################################################################
# timeout handling
#
# return the lamport clock time in nanosecond resolution
# fallback to system time()
#
sub mars_time {
my $lamport_time;
if (open(my $lamport_clock, "<", "/proc/sys/mars/lamport_clock")) {
while (<$lamport_clock>) {
$lamport_time = $1 if /^lamport_now=(.*)/;
}
close($lamport_clock);
}
return $lamport_time || time() . "." . '0' x 9;
}
my $timeout_val = undef;
sub sleep_timeout {
my $sleeptime = shift || 1;
my $continue = shift;
if ($timeout < 0) {
sleep($sleeptime);
return 0;
}
if (!defined($timeout_val)) {
$timeout_val = $timeout;
lprint "Resetting timeout to $timeout\n" if $verbose > 0;
}
if ($timeout_val <= 0) {
$timeout_val = undef;
if (!defined($continue) || !$continue) {
ldie "Timeout reached.\n";
}
lwarn "Timeout reached. Continuing anyway.\n";
return 1;
}
my $rest = $timeout_val;
$rest = $sleeptime if $rest > $sleeptime;
sleep($rest);
$timeout_val -= $rest;
return 0;
}
# wait for some condition
sub wait_cond {
my ($cmd, $res, $specific) = @_;
my $is_actual = ($specific =~ s/^(is|has)-//);
$specific =~ s/^todo-//;
my $is_on = !($specific =~ s/-(off|0)$//);
$specific =~ s/-(on|1)$//;
if ($is_actual) {
if ($specific eq "device") {
check_mars_device($cmd, $res, 1, !$is_on);
return;
}
my %table =
(
"attach" => "is-attached",
"attached" => "is-attached",
"replay" => "is-replaying",
"replaying"=> "is-replaying",
"fetch" => "is-copying",
"fetching" => "is-copying",
"copy" => "is-copying",
"copying" => "is-copying",
"sync" => "is-syncing",
"syncing" => "is-syncing",
"primary" => "is-primary",
);
my $name = $table{$specific};
ldie "actual indicator '$specific' does not exist\n" unless exists($table{$specific});
check_status($cmd, $res, $name, $is_on ? 1 : 0, 1, 1);
} else {
my %table =
(
"attach" => "attach",
"attached" => "attach",
"fetch" => "fetch",
"connect" => "fetch",
"replay" => "replay",
"sync" => "sync",
);
my $name = $table{$specific};
ldie "button '$specific' does not exist\n" unless exists($table{$specific});
check_todo($cmd, $res, $name, $is_on ? 1 : 0, 1);
}
}
# wait until everything is recent
sub wait_cluster {
my $cmd = shift;
my $res = shift || "all";
my $hosts = shift || "*";
my $abort = shift;
$abort = $force unless defined($abort);
my $trigger_code = shift || ($hosts =~ m/\*/ ? 3 : 2);
my $non_participating = ($trigger_code >= 8);
$timeout_val = undef;
finish_links();
lprint "Ping $hosts\n";
_trigger($trigger_code);
my %old_status = get_alive_links($res, "time", $hosts, 0, $non_participating);
my $total_count = scalar(keys(%old_status));
my $start_time = mars_time();
my %progress;
foreach my $peer (keys(%old_status)) {
$progress{$peer} = $non_participating ? 1 : 0;
}
lprint "Wait for answers from " . join(",", sort alphanum_cmp keys(%old_status)) . "\n" if %old_status;
my $delta = $window > 0 ? $window / 2 : 30;
while (1) {
my $dead_count = 0;
my $alive_count = 0;
my $unknown_count = 0;
my %status = get_alive_links($res, "time", $hosts, 0, $non_participating);;
my $now = mars_time();
foreach my $peer (keys(%status)) {
next if $peer eq $host;
if ($status{$peer} > $old_status{$peer} &&
$status{$peer} > $start_time &&
$progress{$peer}-- <= 0) {
$alive_count++;
} elsif ($status{$peer} + $delta < $now) {
$dead_count++;
} else {
$unknown_count++;
}
}
if ($unknown_count) {
_trigger($trigger_code);
# ensure more progress will happen
%old_status = %status;
} else {
if (!$dead_count) {
lprint "$alive_count/$total_count peer(s) seem to be alive\n";
} else {
lwarn "$alive_count/$total_count peer(s) seem to be alive, and $dead_count peer(s) seem to be dead / not reachable\n";
ldie "aborting (--force was not given)\n" if $abort;
}
last;
}
sleep_timeout();
last if $timeout_val <= 0 && !$unknown_count;
}
}
sub wait_cluster_noforce {
my ($cmd, $res) = @_;
if (!$force) {
lprint "WAITING for communication\n" if $verbose;
wait_cluster($cmd, $res, "*", 0);
}
}
sub update_cluster {
my $cmd = shift;
my $res = shift || "all";
lprint "UPDATING $res\n" if $verbose;
wait_cluster($cmd, $res, "*", 0, 8);
}
sub is_cluster_recent {
my ($cmd, $res, $hosts) = @_;
my $dead_count = 0;
my $alive_count = 0;
my $unknown_count = 0;
my $now = mars_time();
my %status = get_alive_links($res, "time", $hosts);
foreach my $peer (keys(%status)) {
next if $peer eq $host;
if ($status{$peer} + $window/2 >= $now) {
$alive_count++;
} elsif ($status{$peer} + $window < $now) {
$dead_count++;
} else {
$unknown_count++;
}
}
return ($dead_count, $alive_count, $unknown_count);
}
sub recent_cluster {
my $cmd = shift;
my $res = shift || "all";
my $hosts = shift || "*";
my ($dead_count, $alive_count, $unknown_count) = is_cluster_recent($cmd, $res, $hosts);
return 1 if (!$dead_count && !$unknown_count);
wait_cluster($cmd, $res, $hosts, 0);
return 0;
}
##################################################################
# syntactic checks
# (also check for existence)
sub check_id {
my ($str, $must_exist) = @_;
ldie "identifier '$str' is empty" unless defined($str) && $str;
ldie "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z0-9_][-A-Za-z0-9_]*$/;
ldie "identifier '$str' is too long (only 63 chars supported according to RFC 1123)" if length($str) > 63;
if (defined($must_exist) && $must_exist) {
my $ip_path = "$mars/ips/ip-$str";
unless (get_link($ip_path, 1)) {
ldie "host '$str' does not exist in $mars/ips/\n" if !$force;
lwarn "host '$str' does not exist in $mars/ips/ - hopefully you know what you are doing via --force\n";
}
}
}
sub check_id_list {
my ($str, $must_exist) = @_;
ldie "comma-separated list '$str' has disallowed characters" unless $str =~ m/^[A-Za-z0-9_][-A-Za-z0-9_,]*$/;
foreach my $id (split(",", $str)) {
check_id($id, $must_exist);
}
}
##################################################################
# semantic checks
sub check_res {
my $res = shift;
wait_cluster("wait-cluster", "all", "*", 0) unless -d "$mars/resource-$res";
if (not -d "$mars/resource-$res") {
# Ensure that all cluster information is present
# DO WHAT I MEAN: try to substitute a device name for a badly given resource name if it is unique
my $count = 0;
my $found;
my @tests = lamport_glob("$mars/resource-*/device-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
if ($target eq $res) {
$found = $test;
$count++;
}
}
if (!$count) {
@tests = lamport_glob("$mars/resource-*/_direct-*-$host");
foreach my $test (@tests) {
my $target = get_link($test, 2);
$target =~ s/^.*,//;
if ($target eq $res) {
$found = $test;
$count++;
}
}
}
ldie "resource '$res' does not exist ($count replacements found)\n" unless $count == 1 and $found;
$found =~ s:^.*/resource-(.*)/.*$:$1:;
lwarn "substituting bad resource name '$res' by uniquely matching resource name '$found'\n";
$res = $found;
}
return $res;
}
sub _get_mars_size {
my ($cmd, $res) = @_;
my $dev_name = get_link("$mars/resource-$res/device-$host");
my $info = "/sys/devices/virtual/block/mars!$dev_name/size";
return `cat $info` * 512;
}
sub check_sizes {
my ($res, $peer) = @_;
my $physical_size = get_link("$mars/resource-$res/actsize-$peer", 2) || return;
my $logical_size = get_link("$mars/resource-$res/size", 1);
if (defined($physical_size) && $physical_size < $logical_size) {
lwarn "Physical device on host '$peer' has size $physical_size, which is smaller than the logical resource size $logical_size\n";
ldie "This is too dangerous. It cannot work. Fix it!\n" unless $force;
}
}
sub check_res_member {
my ($cmd, $res) = @_;
if (! link_exists("$mars/resource-$res/data-$host")) {
if (link_exists("$mars/resource-$res/replay-$host")) {
lwarn "Resource '$res' seems to have been destroyed.\n";
lwarn "Nevertheless, a replay link exists for host '$host'.\n";
lwarn "This can happen after 'leave-resource --host=$host' while host $host was active.\n";
lwarn "mars-user-manual.pdf forbids any usage of such a resource _strongly_.\n";
lwarn "In order to finally remove this resource from $host, use the\n";
lwarn "command 'marsadm leave-resource --force $res'\n";
} else {
lwarn "Sorry, I have not yet joined to resource '$res'\n";
}
ldie "Refusing work on resource name '$res'\n" unless $force;
lwarn "Running '$cmd' is dangerous, continuing on your own risk\n" unless $cmd eq "leave-resource";
}
check_sizes($res, $host);
}
sub _sync_finished {
my ($res, $peer) = @_;
my $lnk = "$mars/resource-$res/syncstatus-$peer";
return 0 unless link_exists($lnk);
my $syncstatus = get_link($lnk, 1);
my $size = get_link("$mars/resource-$res/size");
return 0 if ($size <= 0);
return 0 if ($syncstatus < $size);
my $syncpos = get_link("$mars/resource-$res/syncpos-$peer", 2);
if (defined($syncpos) && $syncpos) {
my $replay = get_link("$mars/resource-$res/replay-$peer", 1);
return 0 unless $replay;
my $cmp = compare_replaylinks($syncpos, $replay);
return 0 if $cmp > 0;
}
return 1;
}
sub check_sync_finished {
my ($res, $peer, $do_force) = @_;
check_sizes(@_);
if (!_sync_finished($res, $peer)) {
lwarn "Sync has not yet finished on host '$peer'\n";
if ($peer eq $host) {
lwarn "Don't try to make inconsistent host '$host' the new primary!\n";
lwarn "Please wait until sync has finished and all logfile have been replayed.\n";
ldie "Refusing to switch inconsistent host '$host' to primary\n";
} else {
lwarn "Changing the primary role during sync is dangerous for data consistency on host '$peer'!\n";
}
ldie "First wait for sync finished before trying to switch primary!\n" unless (defined($do_force) && $do_force);
}
lprint "OK, it seems that sync has finished on host '$peer'.\n";
}
sub check_primary {
my ($cmd, $res, $no_designated, $no_fail) = @_;
my $fail = 0;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk, 1);
if (!$is_primary) { # give it a second chance
my $name = get_link("$mars/resource-$res/device-$host", 1);
my $dev = "/dev/mars/$name";
$is_primary = 1 if -b $dev;
}
unless ($is_primary) {
lwarn "For operation '$cmd' I need to be primary\n";
$fail++;
}
if ($no_designated) {
my $primary = _get_designated_primary($res);
unless ($primary eq $host) {
lwarn "For operation '$cmd', I also must be the designated primary\n";
}
}
unless (defined($no_fail) && $no_fail) {
ldie "Operation '$cmd' only works at the primary side\n" if $fail;
}
return $fail;
}
sub check_not_primary {
my ($cmd, $res) = @_;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk, 1);
if ($is_primary) {
ldie "operation '$cmd' cannot be executed on primary\n" unless $force;
lwarn "operation '$cmd' is forced on actual primary '$host', THIS IS RISKY\n";
}
# also check whether we intend to become primary
my $primary = _get_designated_primary($res);
ldie "operation '$cmd' cannot be executed on designated primary\n" if $primary eq $host;
}
sub check_primary_gone {
my ($cmd, $res, $peer) = @_;
return 0 unless (defined($peer) && $peer && $peer ne "(none)");
return 0 if $peer eq $host;
my $lnk = "$mars/resource-$res/actual-$peer/is-primary";
my $is_primary = get_link($lnk, 1);
if (defined($is_primary) && $is_primary eq "0") {
lprint "OK, other host ($peer) is not actual primary.\n";
return 0;
}
lprint "waiting for other primary host ($peer) to disappear....\n";
return 1;
}
sub _make_messages {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
my %table =
(
"is-attached" => "attach",
"is-syncing" => "sync",
"is-copying" => "fetch",
"is-replaying" => "replay",
"is-primary" => "primary",
);
my $key_msg = defined($table{$key}) ? $table{$key} : $key;
my $val_msg = $val ? "on" : "off";
my $wait_msg = $wait ? "waiting $timeout until" : "ensuring that";
my $new_val = (defined($inv) && $inv) ? !$val : $val;
my $action_msg = ($new_val ? "'resume-" : "'pause-") . $key_msg . "'";
if ($key_msg eq "primary") {
$action_msg = $new_val ? "'$key_msg'" : "'secondary' (or better 'primary' on another host)";
} elsif ($key_msg eq "attach") {
$action_msg = $new_val ? "'$key_msg'" : "'detach'";
}
$action_msg = "Use $action_msg and/or wait until it has really succeeded.";
$action_msg .= " Notice that MARS is an ASYNCHRONOUS system, where both execution of actions, as well as propagation over the network may take some time." if $verbose;
$action_msg .= " Don't expect magic to happen if the network has some problems, or when the system load is extremely high." if $verbose > 1;
$action_msg .= " Please read the PDF manual, and try to understand it." if $verbose > 2;
return ($key_msg, $val_msg, $wait_msg, $action_msg);
}
sub check_todo {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_;
$key =~ s/fetch/connect/;
my $path = "$mars/resource-$res/todo-$host/$key";
$path = correct_path($path);
my $link;
for (;;) {
$link = get_link($path, $unchecked);
return unless defined($link);
my ($key_msg, $val_msg, $wait_msg, $action_msg) = _make_messages(@_);
if (defined($inv) && $inv) {
last if $link != $val;
lprint "$wait_msg switch '$key' != '$val'....\n";
ldie "Cannot execute $cmd on resource $res: todo-switch '$key_msg' must not be $val_msg. $action_msg\n" if !$wait;
} else {
last if $link == $val;
lprint "$wait_msg switch '$key' == '$val'....\n";
ldie "Cannot execute $cmd on resource $res: todo-switch '$key_msg' must be $val_msg, but actually has value '$link'. $action_msg\n" if !$wait;
}
sleep_timeout();
}
lprint "OK, '$path' has acceptable value '$link'\n";
}
sub _run_compensation {
my ($compensation) = @_;
if (defined($compensation)) {
my $txt = $compensation;
$txt =~ s/\n/\\n/mg;
lprint "Running compensation action '$txt'\n";
eval $compensation;
}
}
sub check_status {
my ($cmd, $res, $key, $val, $wait, $unchecked, $inv, $peer, $action, $compensation) = @_;
$peer = $host unless defined($peer);
my $path = correct_path("$mars/resource-$res/actual-$peer/$key");
my $link;
my $rounds = 0;
my $fail_round = 10;
for (;;) {
$link = get_link($path, $unchecked);
$link = 0 unless (defined($link) && $link ne "");
my ($key_msg, $val_msg, $wait_msg, $action_msg) = _make_messages(@_);
if (defined($inv) && $inv) {
last if $link != $val;
lprint "at $peer: $wait_msg actual '$key' != '$val'...\n";
ldie "Cannot execute $cmd on resource $res: actual '$key_msg' must not be $val_msg. $action_msg Also ensure that your command _can_ succeed.\n" if !$wait;
} else {
last if $link == $val;
lprint "at $peer: $wait_msg actual '$key' == '$val'...\n";
ldie "Cannot execute $cmd on resource $res: actual '$key_msg' must be $val_msg. $action_msg Also ensure that your command _can_ succeed.\n" if !$wait;
}
if (defined($action) && $action && $rounds > 1) {
lprint "action: $action\n" if $verbose;
my $action_status = 0;
my $old_error_count = $error_count;
eval "$action";
$error_count = $old_error_count;
# Tolerate intermediate failures for some time
if ($action_status && $rounds > $fail_round) {
_run_compensation($compensation);
ldie "Action failure, status=$action_status\n";
}
}
my $status = sleep_timeout(undef, 1);
if ($status) {
_run_compensation($compensation);
ldie "Timeout\n";
}
$rounds++;
}
lprint "OK at $peer: '$path' has acceptable value '$link'\n";
}
sub check_mars_device {
my ($cmd, $res, $wait, $inv) = @_;
my $name = get_link("$mars/resource-$res/device-$host", $inv);
my $dev = "/dev/mars/$name";
my $backoff = 1;
my $round = 0;
if ($inv) {
while (-b $dev) {
ldie "cannot execute $cmd: device '$dev' has not yet disappeared\n" if !$wait;
lwarn "device '$dev' has not yet disappeared\n";
sleep_timeout($backoff);
# very slowly increasing backoff
if ($backoff < 10 && $round++ > 5) {
$round = 0;
$backoff++;
}
systemd_activate($cmd, $res, 0, 1);
}
lprint "device '$dev' is no longer present\n" unless -b $dev;
return;
}
# !$inv
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', I should be the designated primary\n" unless $primary eq $host;
while (! -e $dev) {
my $text = get_error_text($cmd, $res);
lprint $text if $text;
ldie "aborting due to errors\n" if $text =~ m/error/mi;
ldie "cannot execute $cmd: device '$dev' not yet present\n" if !$wait;
lprint "device '$dev' not yet present\n";
sleep_timeout($backoff);
# very slowly increasing backoff
if ($backoff < 10 && $round++ > 5) {
$round = 0;
$backoff++;
}
}
lprint "device '$dev' is present\n" if -b $dev;
}
sub check_userspace {
my ($dst) = @_;
if ($dst !~ m:/userspace/:) {
ldie "your path '$dst' must be inside $mars/userspace/ or $mars/resource-*/userspace/\n" unless $force;
lwarn "your path '$dst' is outside $mars/userspace/ or $mars/resource-*/userspace/ and you gave --force, hopefully YOU KNOW WHAT YOU ARE DOING\n";
}
}
sub check_sync_startable {
my ($cmd, $res) = @_;
my $primary = _get_designated_primary($res);
ldie "Cannot execute '$cmd' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
# no danger when switch is turned on at the primary side.
return if $primary eq $host;
my $emergency_path = "$mars/resource-$res/actual-$primary/has-emergency";
my $emergency = get_link($emergency_path, 1);
if ($emergency) {
ldie "Primary '$primary' is in emergency mode. Cannot start sync.\nFree some space there first.\n";
}
}
##################################################################
# state inspection routines
sub _get_minmax {
my ($res, $glob, $take_symlink) = @_;
my $min = -1;
my $max = -1;
my @paths = lamport_glob($glob) or lwarn "cannot find '$glob'\n";
foreach my $path (@paths) {
my $nr = $path;
if ($take_symlink) {
$nr = get_link($path, 1);
}
$nr =~ s@^(?:.*/)?[a-z]+-([0-9]+)(-[^/]*)?$@$1@;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
}
return ($min, $max);
}
sub get_minmax_logfiles {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/log-*$peer", 0);
}
sub get_minmax_versions {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/version-*$peer", 0);
}
sub get_minmax_any {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/{log,version}-*$peer", 0);
}
sub get_minmax_replays {
my ($res, $peer) = @_;
$peer = "" unless defined($peer);
return _get_minmax($res, "$mars/resource-$res/replay-*$peer", 1);
}
##################################################################
# generic comparisons
sub compare_replaylinks {
my ($a, $b) = @_;
$a =~ m/log-([0-9]+)[^,]*,([0-9]+)/;
my ($a_log, $a_pos) = ($1, $2);
$b =~ m/log-([0-9]+)[^,]*,([0-9]+)/;
my ($b_log, $b_pos) = ($1, $2);
return -1 if $a_log < $b_log;
return +1 if $a_log > $b_log;
return -1 if $a_pos < $b_pos;
return +1 if $a_pos > $b_pos;
return 0;
}
##################################################################
sub get_amount {
my ($resdir, $host, $direction, $only_files) = @_;
my $level = 0;
my $firstpos = 0;
my $logpos = 0;
my $oldpos = 0;
my $sum = 0;
for (;;) {
my $val0 = 0;
my $ok = 0;
if (!$level) {
my $replay_path = sprintf("%s/replay-%s", $resdir, $host);
my $replay_link = get_link($replay_path, 1);
return (0, 0, 0, 0) if !$replay_link;
return (0, 0, 0, 0) if $replay_link !~ m:log-([0-9]+)-[^,]+,([0-9]+),([0-9]+):;
$firstpos = int($1);
$logpos = $firstpos;
$oldpos = $firstpos;
if ($direction < 0) {
$sum = $2;
} else {
$val0 = $2;
$sum = $3;
}
}
if ($level > 0 || $direction > 0) {
my $file = sprintf("%s/log-%09d-%s", $resdir, $logpos, $host);
my @stat = stat($file);
my $val = 0;
if (@stat && file_exists($file)) {
$val = $stat[7];
$ok = 1;
} else {
my $glob = sprintf("%s/log-%09d-*", $resdir, $logpos);
foreach $file (lamport_glob($glob)) {
my @tstat = stat($file);
if (@tstat && $tstat[7] > $val) {
@stat = @tstat;
$val = $stat[7];
$ok = 1;
}
}
}
if (!$only_files) {
my $glob = sprintf("%s/version-%09d-*", $resdir, $logpos);
foreach $file (lamport_glob($glob)) {
my $vers_link = get_link($file, 1);
if ($vers_link && $vers_link =~ m;,([0-9]+):;) {
my $nval = $1;
$val = $nval if $nval > $val;
$ok = 1;
}
}
}
return ($sum, $firstpos, $oldpos, $level) unless $ok;
if ($level > 0) {
$sum += $val;
} else {
# the logfile may be bigger than the replay pos/length
my $new_sum = $val - $val0;
$sum = $new_sum if $new_sum > $sum;
$sum = 0 if $sum < 0;
}
}
$oldpos = $logpos;
if ($direction > 0) {
$logpos++;
} elsif ($direction < 0) {
$logpos--;
return ($sum, $firstpos, $oldpos, $level) if $logpos < 1;
} else {
return ($sum, $firstpos, $oldpos, $level);
}
$level++;
}
}
##################################################################
# versionlink path handling routines
my %visited_pos;
sub _visit {
my ($nr, $peer) = @_;
$nr =~ s:^0*::;
my $visit = "$nr,$peer";
$visited_pos{$visit} = 1;
}
sub _is_visited {
my ($nr, $peer) = @_;
$nr =~ s:^0*::;
my $visit = "$nr,$peer";
return $visited_pos{$visit};
}
sub _parse_pos {
my ($basedir, $pos) = @_;
if ($pos =~ m/log-([0-9]+)-([^,]+)/) {
_visit($1, $2);
} elsif ($pos =~ m/version-([0-9]+)-([^,]+)/p) {
my $vers = get_link("$basedir/$MATCH");
my $count_matches = 0;
while ($vers =~ m/log-([0-9]+)-([^,]+)/p) {
$vers = $POSTMATCH;
_visit($1, $2);
$count_matches++;
}
lwarn "cannot parse '$pos' -> '$vers'\n" unless $count_matches;
} else {
lwarn "cannot parse '$pos'\n";
}
$pos =~ m/((?:log|version)-([0-9]+)-([^,]+)(?:,([0-9]+))?)/ or lwarn "cannot parse position info '$pos'\n";
return ($1, int($2), $3, defined($4) ? int($4) : -1);
}
sub _get_prev_pos {
my ($basedir, $nr, $peer) = @_;
my $path = sprintf("version-%09d-$peer", $nr);
my $vers = get_link("$basedir/$path", 2);
if (!defined($vers) || !$vers) {
# Scarce race.
# This can happen when new logfiles are present but not yet worked on.
# Also improves robustness on damaged filesystems.
# Decrement nr by hand, try, take the first part.
$path = sprintf("version-%09d-$peer", $nr - 1);
$vers = get_link("$basedir/$path", 2);
$vers =~ s/:.*// if defined($vers) && $vers;
} else {
# take the last part, pointing to the predecessor versionlink.
$vers =~ s/^.*://;
}
_parse_pos($basedir, $path) if defined($vers) && $vers;
return $vers;
}
sub _get_common_ancestor {
my ($total1, $total2) = (0, 0);
for (;;) {
my ($basedir, $pos1, $host1, $dep1, $pos2, $host2, $dep2) = @_;
my ($p1, $nr1, $from1, $len1) = _parse_pos($basedir, $pos1);
my ($p2, $nr2, $from2, $len2) = _parse_pos($basedir, $pos2);
if ($p1 eq $p2) {
# usually no split brain here (only if both path depths are non-zero)
my $split = ($dep1 && $dep2);
if (!$split) { # additionally check the corresponding version links
my $path1 = sprintf("$basedir/version-%09d-$from1", $nr1);
my $path2 = sprintf("$basedir/version-%09d-$from2", $nr2);
if (my $vers1 = get_link($path1, 1) and my $vers2 = get_link($path2, 1)) {
if ($vers1 ne $vers2) {
$split = 1;
$total1 += $len1;
$total2 += $len2;
}
}
}
return ($p1, $split, $total1, $total2);
} elsif ($nr1 > $nr2) {
# just flip arguments
@_ = ($basedir, $pos2, $host2, $dep2, $pos1, $host1, $dep1);
($total1, $total2) = ($total2, $total1);
next;
} elsif ($nr1 < $nr2) {
# recursively advance path depth
$total2 += $len2;
my $vers2 = _get_prev_pos($basedir, $nr2, $host2);
return ("", -1, $total1, $total2) if !$vers2;
@_ = ($basedir, $pos1, $host1, $dep1, $vers2, $host2, $dep2 + 1);
next;
} elsif ($from1 ne $from2) {
# split brain is sure now, but continue computing the common split point
my $vers1 = _get_prev_pos($basedir, $nr1, $host1);
return ("", 1, $total1 + $len1, $total2 + $len2) if !$vers1;
my $vers2 = _get_prev_pos($basedir, $nr2, $host2);
return ("", 1, $total1 + $len1, $total2 + $len2) if !$vers2;
my ($res, $split, $t1, $t2) = _get_common_ancestor($basedir, $vers1, $host1, $dep1 + 1, $vers2, $host2, $dep2 + 1);
return ($res, 1, $t1 + $total1 + $len1, $t2 + $total2 + $len2);
} elsif ($len1 < $len2) {
# there may be no split brain (just incomplete replay) depending on path depth
return ($p1, $dep1, $total1 + $len1, $total2 + $len2);
} elsif ($len2 < $len1) {
# dto symmetric
return ($p2, $dep2, $total1 + $len1, $total2 + $len2);
}
lwarn "error in algorithm: $p1, $nr1, $from1, $len1 : $p2, $nr2, $from2, $len2\n";
return ("", -1, 0, 0);
}
}
sub get_common_ancestor {
my ($basedir, $host1, $host2) = @_;
my $repl1 = get_link("$basedir/replay-$host1", 1);
my $repl2 = get_link("$basedir/replay-$host2", 1);
return _get_common_ancestor($basedir, $repl1, $host1, 0, $repl2, $host2, 0);
}
my %detected_splits = ();
sub detect_splitbrain {
my ($res, $do_report) = @_;
# dynamic programming
return $detected_splits{$res} if defined($detected_splits{$res});
my $basedir = "$mars/resource-$res";
my $ok = 1;
my @list = lamport_glob("$mars/resource-$res/replay-*");
my @hosts = map { $_ =~ s:.*/replay-::; $_ } @list;
foreach my $host1 (@hosts) {
foreach my $host2 (@hosts) {
next if $host1 ge $host2;
my ($point, $split, $size1, $size2);
for (my $retry = 2; $retry > 0; $retry--) {
($point, $split, $size1, $size2) = get_common_ancestor($basedir, $host1, $host2);
last if !$split;
# Workaround races.
sleep(0);
}
if ($split) {
$ok = 0;
if ($do_report) {
my $age = "";
if ($point) {
my $log = "$basedir/$point";
$log =~ s:,.+::;
my $stamp = get_link_stamp($log);
my $vers = $log;
$vers =~ s:/log-:/version-:;
my $stamp2 = get_link_stamp($vers);
# take the minimum
$stamp = $stamp2 if !$stamp || ($stamp2 && $stamp2 < $stamp);
$age = " age ~" . seconds2human(mars_time() - $stamp) if $stamp;
}
lwarn "SPLIT BRAIN of resource '$res' after logfile '$point'$age\n";
if ($point) {
lwarn " hostA = '$host1' logfile_amount='$size1' (" . number2human($size1) . ")\n";
lwarn " hostB = '$host2' logfile_amount='$size2' (" . number2human($size2) . ")\n";
}
} else {
return $ok;
}
}
}
}
if ($ok) { # check for duplicate logfiles
my @logs = lamport_glob("$mars/resource-$res/log-*");
my $oldnr = -1;
foreach my $path (sort alphanum_cmp @logs) {
$path =~ m:/log-([0-9]+):;
my $nr = $1;
if ($nr == $oldnr) {
$ok = 0;
lwarn "SPLIT BRAIN at resource '$res' detected: duplicate logfile number $nr\n";
lwarn "hint: first resolve split brain by 'leave-resource' or 'invalidate'\n";
lwarn "hint: if this does not help, try cleanup via 'log-purge-all'\n";
lwarn "hint: if this does not help, try 'log-purge-all --force'\n";
last;
}
$oldnr = $nr;
}
}
$detected_splits{$res} = $ok;
return $ok;
}
sub _mark_path_backward {
my ($basedir, $pos, $peer, $skip, $jump_peer) = @_;
my $sum = 0;
my $base_nr = 0;
for (;;) {
my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos);
last if defined($skip) && $nr < $skip;
$base_nr = $nr;
_visit($nr, $peer);
# When following chains from foreign hosts (e.g. the designated primary),
# we must jump over to our own chain somewhen, because the lengths of
# the chains may be different (caused by invalidate & friends).
if (defined($jump_peer) && $jump_peer ne $peer) {
my $peer_path = sprintf("$basedir/version-%09d-$peer", $nr);
my $jump_path = sprintf("$basedir/version-%09d-$jump_peer", $nr);
my $peer_version = get_link($peer_path, 2);
my $jump_version = get_link($jump_path, 2);
if (defined($peer_version) && defined($jump_version) && $peer_version eq $jump_version) {
$peer = $jump_peer;
}
}
$pos = _get_prev_pos($basedir, $nr, $peer, 1);
last if !$pos;
# optionally don't count the last versionlink, pointing into nirvana
if (defined($skip) && $skip && $nr > 1) {
my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos);
last if !$p;
my $next = _get_prev_pos($basedir, $nr, $peer, 1);
last if !$next;
}
$sum += $len;
}
return ($sum, $base_nr);
}
sub _mark_path_forward {
my ($basedir, $pos, $peer) = @_;
my @list = ($pos);
while (@list) {
my %next_list;
foreach $pos (@list) {
my ($p, $nr, $from, $len) = _parse_pos($basedir, $pos);
my $cand = sprintf("$basedir/version-%09d-$peer", $nr + 1);
my $vers = get_link($cand, 2);
next unless defined($vers) && $vers ne "";
$vers =~ s/^.*://;
my ($cp, $cnr, $cfrom, $clen) = _parse_pos($basedir, $vers);
if (int($cnr) == int($nr) && $cfrom eq $from && $clen == $len) {
$next_list{$cand} = 1;
}
}
@list = keys(%next_list);
}
}
sub _mark_path_transitive {
_mark_path_forward(@_);
_mark_path_backward(@_);
}
sub log_purge_res {
my ($cmd, $res) = @_;
lwarn "DANGEROUS OPERATION: $cmd --force on resource '$res'\n" if $force;
%visited_pos = ();
my %start_logs;
my $start_count = 0;
my $basedir = "$mars/resource-$res";
foreach my $data (lamport_glob("$basedir/{data,replay}-*")) {
$data =~ m:/(data|replay)-(.+):;
my $peer = $2;
my $replay = "$basedir/replay-$peer";
my $target = get_link($replay, 1);
lprint "found replay link '$replay' -> '$target'\n";
$target =~ s/,.*//;
$start_logs{$target}++;
$start_count++;
_mark_path_transitive($basedir, $target, $peer);
}
if (!$start_count) {
lprint "Resource contains no valid information - there is nothing to purge\n";
return;
}
my %logs;
foreach my $file (lamport_glob("$basedir/version-*")) {
$file =~ m:/(version-([0-9]+)-([^,]+)): or ldie "bad path '$file'\n";
my $cand = $1;
my $nr = $2;
my $from = $3;
lprint "checking '$cand'\n";
my $vers = get_link($file, 1);
$vers =~ m/(log-[0-9]+-[^,:]+)/;
my $log = $1;
lprint " corresponding logfile is '$log'\n";
if (_is_visited($nr, $from)) {
lprint " ok '$cand'\n";
$logs{$log}++;
next;
}
if (!$force && $from ne $host) {
lprint " skipping foreign object '$cand'\n";
$logs{$log}++;
next;
}
lwarn "deleting foreign object from peer '$from' because you said --force\n" if $from ne $host;
_create_delete($file);
}
foreach my $file (lamport_glob("$basedir/log-*")) {
$file =~ m:/(log-[0-9]+-(.*)): or ldie "bad path '$file'\n";
my $log = $1;
my $from = $2;
lprint "checking '$log'\n";
if ($logs{$log}) {
lprint " ok '$log'\n";
$logs{$log} = -1;
next;
}
if ($start_logs{$log}) {
lprint " ok start '$log'\n";
$logs{$log} = -1;
next;
}
if (!$force && $from ne $host) {
lprint " skipping foreign object '$log'\n";
next;
}
lwarn "deleting foreign object from peer '$from' because you said --force\n" if $from ne $host;
_create_delete($file);
}
my $count = 0;
foreach my $log (sort alphanum_cmp keys(%logs)) {
my $nr = $logs{$log};
next if $nr < 0 || any_exists("$basedir/$log");
lprint_stderr "info: logfile '$log' is referenced ($nr), but not present.\n";
$count++;
}
if ($count) {
lprint_stderr " Unreferenced logfiles are not necessarily bad.\n";
lprint_stderr " They can regularly appear after 'leave-resource',\n";
lprint_stderr " or 'invalidate', or after emergency mode,\n";
lprint_stderr " or after similar operations.\n";
}
finish_links();
}
sub err_purge_res {
my ($cmd, $res) = @_;
foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}actual-$host/msg-err-*")) {
lprint " deleting '$leftlink'\n" if $verbose;
_create_delete($leftlink);
}
}
sub try_to_avoid_splitbrain {
my ($cmd, $res, $old_primary) = @_;
my $old_timeout = $timeout;
$old_primary = "" if $old_primary eq "(none)";
if (!detect_splitbrain($res, 0)) {
lwarn "ATTENTION: you are starting a non-forced primary switchover in a split brain situation.\n";
lwarn "ATTENTION: that's no good idea.\n";
lwarn "ATTENTION: I will continue to do what you want.\n";
lwarn "ATTENTION: But you are responsible for the consequences.\n";
return 0;
}
# now try to prevent producing a _new_ split brain situation....
my @host_list = lamport_glob("$mars/resource-$res/replay-*");
return 0 if scalar(@host_list) < 2;
my ($min, $max) = get_minmax_versions($res);
my $vers_glob = sprintf("$mars/resource-$res/version-%09d-*", $max);
my $ok = 1;
my $replay_err_path = "$mars/resource-$res/actual-$host/msg-err-replay-stop";
my $replay_err = get_link($replay_err_path, 1);
if ($replay_err && $replay_err ne "OK") {
my @stat = lstat($replay_err_path);
if ($stat[9] + $window >= mars_time()) {
my $msg = _get_text($replay_err);
lwarn "cannot guarantee split brain avoidance: $msg\n";
return 0;
}
}
my $emergency_path = "$mars/resource-$res/actual-$host/has-emergency";
my $emergency = get_link($emergency_path, 1);
if ($emergency) {
ldie "emergency mode $emergency has been entered locally: handover is not possible. Either free some space in $mars/, or use --force to use a potentially outdated version.\n";
}
my $primary = _get_designated_primary($res, -1);
if ($primary eq "(none)") {
# try to determine the old primary when unique
my $glob_logs = sprintf("$mars/resource-$res/log-%09d-*", $max);
my @candidates = lamport_glob($glob_logs);
if (scalar(@candidates) == 1) {
my $log_path = pop @candidates;
if ($log_path =~ m:/log-[0-9]+-(.+)$:) {
$primary = $1;
lprint "Using last primary '$primary' as a substitute.\n";
}
}
}
if ($primary && $primary eq $host) {
lprint "Switching back to last primary.\n";
return 0;
}
# if the old primary is known, we can ignore all other / unrelated hosts
if ($primary && $primary ne $host && $primary ne "(none)") {
my $p_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $primary);
my $h_path = sprintf("$mars/resource-$res/version-%09d-%s", $max, $host);
my $p_vers = get_link($p_path, 1);
my $h_vers = get_link($h_path, 1);
if (!$p_vers || !$h_vers || $p_vers ne $h_vers) {
# Safeguard against artificial or missing version links
# (e.g. after crashes etc):
# when replay links are equal, we are safe.
my $path_p = "$mars/resource-$res/replay-$primary";
my $path_h = "$mars/resource-$res/replay-$host";
my $replay_p = get_link($path_p);
my $replay_h = get_link($path_h);
my $stamp_p = get_link_stamp($path_p);
my $stamp_h = get_link_stamp($path_h);
my $p_stamp = get_link_stamp($p_path);
my $stable_stamp = mars_time() - $window / 4;
if ($replay_p eq $replay_h &&
$p_vers &&
$p_stamp < $stable_stamp &&
$stamp_p < $stable_stamp && $stamp_h < $stable_stamp) {
# self-healing the versionlink
lwarn "Correcting the versionlink '$h_path' from '$h_vers' to '$p_vers'\n";
set_link($p_vers, $h_path);
finish_links();
} else {
$ok = 0;
}
}
} else {
# old primary is unknown: we have no chance, other than comparing _all_ versions.
my @versions = lamport_glob($vers_glob);
my $first = get_link(shift @versions);
while (@versions) {
my $next = get_link(shift @versions);
if ($next ne $first) {
$ok = 0;
}
}
}
return 0 if $ok;
lprint "Trying to avoid split brain for $timeout s: logfile update not yet completed.\n";
my $tpl = get_macro("replinfo");
my $new_situation = eval_macro($cmd, $res, $tpl, @_);
print $new_situation;
# condition not met
return 1;
}
sub get_size {
my $arg = shift;
my $orig_arg = $arg;
# Do what I mean: when given a device, take its actual size.
if (-b $arg) {
my $mangled = $arg;
$mangled =~ s:/dev/::;
$mangled =~ s:/:!:g;
my $path = "/sys/block/$mangled/size";
$arg = `cat $path` * 512 if -r $path;
}
if ($arg !~ m/^([0-9]+(?:\.[0-9]*)?)([kmgtp]?)$/i) {
ldie "Size argument '$arg' must be a number, optionally followed by a suffix [kKmMgGtTpP]. Lowercase = multiples of 1000, Uppercase = multiples of 1024.\n";
}
my $mod = $2 || "";
$arg = $1;
$_ = $mod;
SWITCH: {
/^$/ and last SWITCH;
/^k$/ and $arg *= 1000, last SWITCH;
/^m$/ and $arg *= 1000 * 1000, last SWITCH;
/^g$/ and $arg *= 1000 * 1000 * 1000, last SWITCH;
/^t$/ and $arg *= 1000 * 1000 * 1000 * 1000, last SWITCH;
/^p$/ and $arg *= 1000 * 1000 * 1000 * 1000 * 1000, last SWITCH;
/^K$/ and $arg *= 1024, last SWITCH;
/^M$/ and $arg *= 1024 * 1024, last SWITCH;
/^G$/ and $arg *= 1024 * 1024 * 1024, last SWITCH;
/^T$/ and $arg *= 1024 * 1024 * 1024 * 1024, last SWITCH;
/^P$/ and $arg *= 1024 * 1024 * 1024 * 1024 * 1024, last SWITCH;
ldie "bad unit suffix '$mod'";
}
ldie "size argument '$orig_arg' evaluating to '$arg' is not a multiple of 4K = 4096\n" if ($arg % 4096) != 0;
return $arg;
}
# DEPRECATED
#
# TST NOTE: avoid calling this function. As such it is conceptually wrong,
# because during split-brain situations, there exists a _set_ of non-unique
# primaries. I want to remove this function, but I currently can't because
# other internal software at 1&1 is depending on it.
#
# Get actual primary node from links below actual-*/ subdirs
#
sub _get_actual_primary {
my ($res) = @_;
# TST: Presence of local device takes precedence over anything else.
# This tries to workaround the most important special case of
# split-brain situations, but cannot fix the problem exhaustively.
llog "DEPRECATED: you are trying to uniquely identify an actual primary hostname (as seen from host $host resource $res), but this is conceptually wrong because in split-brain situations there may exist multiple ones. Use view-is-primary instead. That would be safe.\n";
my $name = get_link("$mars/resource-$res/device-$real_host", 1);
my $dev = "/dev/mars/$name";
return $real_host if -b $dev;
# The following old code is CONCEPTUALLY WRONG for split-brain situations (see NOTE above)
my @primary_links = lamport_glob("$mars/resource-$res/actual-*/is-primary");
my $primary;
foreach my $link (@primary_links) {
if (my $val = get_link($link)) {
$primary = ($link =~ qr%.*actual-([^/]+)/is-primary%)[0];
last;
# Note: if there are more than one 'is-primary' links (an insane state anyway),
# the first 'is-primary' link is selected. Other links are ignored.
}
}
return $primary;
}
my %old_primary;
sub _get_designated_primary {
my ($res, $unchecked) = @_;
my $fallback_to_old = 0;
if (defined($unchecked) && $unchecked == -1) {
$fallback_to_old = 1;
$unchecked = 1;
}
my $val = get_link("$mars/resource-$res/primary", $unchecked);
if (!defined($val) || !$val || $val eq "(none)") {
if ($fallback_to_old) {
$val = $old_primary{$res} if defined($old_primary{$res});
if (!defined($val) || !$val || $val eq "(none)") {
my $last_lnk = "$mars/resource-$res/userspace/last-primary";
$val = get_link($last_lnk, 2);
}
}
} else {
$old_primary{$res} = $val;
}
return $val;
}
sub get_peers {
my ($res) = @_;
my @list = lamport_glob("$mars/resource-$res/data-*");
return map { $_ =~ s:$mars/resource-$res/data-::; $_ } @list;
}
sub __conv_tv {
my ($tv_sec, $tv_nsec) = @_;
if (defined($tv_nsec)) {
$tv_nsec = ".$tv_nsec";
} else {
$tv_nsec = "";
}
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(int($tv_sec));
return "$tv_sec$tv_nsec" unless defined($sec);
return sprintf("%04d-%02d-%02d %02d:%02d:%02d%s", $year+1900, $mon + 1, $mday, $hour, $min, $sec, $tv_nsec);
}
sub __conv_errno {
my ($txt, $error) = @_;
return "$txt$error" if !defined($error) || ($error <= 0);
$! = $error;
my $res = "${txt}${error}[$!]";
$! = 0;
return $res;
}
sub _replace_timestamps {
my ($txt, $omit_nsec) = @_;
if (defined($omit_nsec) && $omit_nsec) {
$txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1):ge;
} else {
$txt =~ s:([0-9]{9,99})\.([0-9]{9}):__conv_tv($1,$2):ge;
}
$txt =~ s:((error|status)\s*=?\s*-)([0-9]+):__conv_errno($1,$3):ge;
return $txt;
}
sub _get_text {
my ($glob, $regex, $do_print, $get_count) = @_;
my $text = "";
my $count = 0;
foreach my $path (lamport_glob($glob)) {
open(IN, "<", $path) or next;
while (my $line = <IN>) {
# use regex e.g. for fetching only errors and warnings
if (!$regex || $line =~ $regex) {
$line = _replace_timestamps($line);
$count++;
if ($do_print) {
print $line;
llog($line);
} else {
$text .= $line;
}
}
}
close(IN);
}
return $count if defined($get_count) && $get_count;
return $text;
}
my %global_error_texts;
sub get_error_text {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/logstatus-$host.status";
if (defined($global_error_texts{$path})) {
return $global_error_texts{$path};
}
my $text = _get_text($path, "m/^(err|warn)/i", 0);
$global_error_texts{$path} = $text;
return $text;
}
##################################################################
# helpers
sub _trigger{
my $code = shift || 1;
system("(echo $code > /proc/sys/mars/trigger) >/dev/null 2>&1");
}
sub _switch {
my ($cmd, $res, $path, $on) = @_;
my $src = $on ? "1" : "0";
$path = correct_path($path);
my $old = get_link($path);
if ($old && $old eq $src) {
lprint "${cmd} on resource $res is already activated\n" if $cmd;
return;
}
set_link($src, $path);
lprint "successfully started ${cmd} on resource $res\n" if $cmd;
}
sub _writable {
my ($path, $on) = @_;
my $oldmode = (lstat $path)[2] & 0700;
my $newmode = $on ? $oldmode | 0200 : $oldmode & ~0200;
lprint "chmod '$path' $oldmode $newmode";
chmod($newmode, $path) == 1 or ldie "cannot chmod '$path'\n";
}
sub _get_ip {
check_id($host);
my $ip_path = "$mars/ips/ip-$host";
if (my $from_link = get_link($ip_path, 2)) {
lprint_stderr "Using IP '$from_link' from '$ip_path'\n" if $verbose;
return $from_link;
}
chomp (my @info = `/sbin/ip addr show`);
my $interface = "";
foreach my $line (@info) {
$interface = $1 if $line =~ m#^[0-9]+:\s([a-zA-Z_0-9]+):#;
next if $interface eq "lo";
if ($line =~ m#\sinet\s(\d+\.\d+\.\d+\.\d+)#) {
my $from_if = $1;
lprint_stderr "Using IP '$from_if' from interface '$interface'\n";
return $from_if;
}
}
return undef;
}
sub _fake_versionlink {
my ($basedir, $log_nr, $primary) = @_;
my $make_count = 0;
for (my $rounds = 2; $rounds > 0 && $log_nr > 0; $rounds--) {
my $new_version = sprintf("$basedir/version-%09d-$host", $log_nr);
my $pri_version = sprintf("$basedir/version-%09d-$primary", $log_nr);
if ($primary eq $host) {
ldie "Cannot fake my own version link '$new_version'\n";
}
my $pri_link = get_link($pri_version, 1);
if ($pri_link) {
lprint "creating new version symlink '$new_version' -> '$pri_link'\n";
set_link($pri_link, $new_version);
$make_count++;
} elsif ($rounds == 2) {
lwarn "cannot create new version symlink '$new_version'\n";
} else {
lprint "cannot create new version symlink '$new_version'\n";
}
$log_nr--;
}
lwarn "cannot create faked versionlink\n" if !$make_count;
}
sub _set_replaylink {
my ($basedir, $log_nr, $primary, $msg) = @_;
$msg = " -- THIS IS EXTREMELY RISKY -- any inconsistencies are on your own!" unless defined($msg);
ldie "no designated primary defined\n" unless ($primary && $primary ne "(none)");
my $rep_path = "$basedir/replay-$host";
my $rep_val = sprintf("log-%09d-$primary,0,0", $log_nr);
lprint "creating new replaylink '$rep_path' -> '$rep_val'\n";
set_link($rep_val, $rep_path);
if ($log_nr > 1) {
_fake_versionlink($basedir, $log_nr - 1, $primary);
} else {
my $initial;
for (;;) {
$initial = get_link("$basedir/version-000000001-$primary", 1);
last if $initial;
sleep_timeout();
}
set_link($initial, "$basedir/version-000000001-$host");
}
set_link("$log_nr$msg", "$basedir/skip-check-$host");
}
##################################################################
# lowlevel tools
sub lowlevel_ls_host_ips {
my ($cmd) = @_;
for my $path (lamport_glob("$mars/ips/ip-*")) {
$path =~ m:/ip-(.*):;
my $peer = $1;
my $ip = get_link($path, 1);
lprint "$peer $ip\n";
}
}
sub lowlevel_set_host_ip {
my ($cmd, $peer, $ip) = @_;
check_id($peer);
my $path = "$mars/ips/ip-$peer";
my $old = get_link($path, 2) || "";
lprint "Set host '$peer' IP from '$old' to '$ip'\n";
set_link($ip, $path);
}
sub lowlevel_delete_host {
my ($cmd, $peer) = @_;
check_id($peer, 1);
my $path = "$mars/ips/ip-$peer";
my $old = get_link($path);
lprint "Removing host '$peer' old IP '$old'\n";
_create_delete($path);
}
##################################################################
# commands
sub ignore_cmd {
my ($cmd, $res) = @_;
lprint "ignoring command '$cmd' on resource '$res'\n";
}
sub senseless_cmd {
my ($cmd, $res) = @_;
lprint "command '$cmd' makes no sense with MARS (ignoring on resource '$res')\n";
}
sub forbidden_cmd {
my ($cmd, $res) = @_;
ldie "command '$cmd' on resource '$res' cannot be used with MARS (migth affect too many hosts, lead to undesired consequences)\n";
}
sub nyi_cmd {
my ($cmd, $res) = @_;
ldie "command '$cmd' on resource '$res' is not yet implemented\n";
}
sub is_module_loaded {
return -d "/proc/sys/mars";
}
sub set_connect_pref_list {
my ($cmd, $res, $list) = @_;
check_res_member($cmd, $res);
my $dst = "$mars/resource-$res/connect-$host";
if ($cmd =~ m/^get-/) {
my $value = get_link($dst);
lprint "$value\n";
return;
}
check_id_list($list, 1);
set_link($list, $dst);
}
sub emergency_limit_res {
my ($cmd, $res, $value) = @_;
my $dst = "$mars/resource-$res/todo-$host/emergency-limit";
if ($cmd =~ m/^get-/) {
my $value = get_link($dst);
lprint "$value\n";
return;
}
ldie "percent argument '$value' isn't numeric\n" unless $value =~ m/^[0-9]+$/;
ldie "percent argument '$value' isn't between 0 and 100%\n" unless ($value >= 0 && $value <= 100);
set_link($value, $dst);
}
sub set_link_cmd {
my $cmd = shift;
for (;;) {
my $src = shift || last;
if ($cmd =~ m/^get-/) {
my $value = get_link($src);
lprint "$value\n";
next;
}
my $dst = shift || ldie "you did not supply a symlink destination for source '$src'\n";
ldie "symlink target '$dst' is not an absolute path\n" unless $dst =~ m:^/:;
check_userspace($dst);
my $dir = `dirname "$dst"` or ldie "path '$dst' has no dirname\n";
chomp $dir;
ldie "directory '$dir' does not exist\n" unless -d $dir;
set_link($src, $dst);
}
}
sub set_sync_limit_value {
my ($cmd, $value) = @_;
my $todo_dir = "$mars/defaults-$host";
ldie "directory '$todo_dir' does not exist\n" unless -d $todo_dir;
my $dst = "$todo_dir/sync-limit";
if ($cmd =~ m/^get-/) {
my $value = get_link($dst);
lprint "$value\n";
return;
}
set_link($value, $dst);
}
sub create_uuid {
my ($cmd) = @_;
my $old_uuid = get_link("$mars/uuid", 2);
ldie "Cluster was already created with uuid='$old_uuid'. " .
"For safety reasons, no override is possible at marsadm level.\n" if $old_uuid;
my $uuid = `echo -n \$(hostname) \$(date)`;
set_link($uuid, "$mars/uuid");
finish_links(); # opportunity for errors => don't continue
lprint "New cluster UUID is '$uuid'\n";
}
sub _create_cluster {
my ($cmd) = @_;
ldie "The $mars directory does not exist.\n" unless -d $mars;
create_uuid(@_) if $cmd eq "create-cluster";
system("mkdir $mars/ips") unless -d "$mars/ips";
system("mkdir $mars/userspace") unless -d "$mars/userspace";
system("mkdir $mars/defaults") unless -d "$mars/defaults";
system("mkdir $mars/defaults-$host") unless -d "$mars/defaults-$host";
set_link("0", "$mars/defaults-$host/sync-limit");
set_link("(none)", "$mars/defaults-$host/sync-pref-list");
system("mkdir $mars/todo-global") unless -d "$mars/todo-global";
mkdir("$mars/actual-$host") unless -d "$mars/actual-$host";
set_link($ip, "$mars/ips/ip-$host");
set_link("1", "$mars/todo-global/deleted-$host");
}
sub create_cluster {
my ($cmd, $peer) = @_;
ldie "cluster is already created\n" if !$force && -d "$mars/ips";
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
_create_cluster(@_);
}
sub join_cluster {
my ($cmd, $peer) = @_;
ldie "Cannot join myself (peer='$peer', host='$host')\n" if $peer eq $host;
ldie "Directory $mars is missing\n" unless -d $mars;
ldie "A valid tree signature '$mars/tree-$host' already exists, thus it appears you are already a cluster member! This cannot be overridden, other by using a freshly created /mars/ filesystem.\n" if link_exists("$mars/tree-$host");
ldie "A cluster UUD '$mars/uuid' already exists, thus it appears you are already a cluster member! This cannot be overridden, other by using a freshly created /mars/ filesystem.\n" if link_exists("$mars/uuid");
if (lamport_glob("$mars/resource-*") or lamport_glob("$mars/ips/*")) {
ldie "Sorry, some resources already exist!\nThis is dangerous!\nIf you are sure that no resource clash is possible, re-invoke this command with '--force' option\n" unless $force;
}
ldie "mars module is loaded, please unload first\n" if is_module_loaded();
lprint "joining cluster via rsync (peer='$peer')\n";
rsync_cmd($peer, "--max-size=1 --update $peer:$mars/ $mars/");
# check uniqness of IPs
my %ips = ();
foreach my $other_ip_path (lamport_glob("$mars/ips/*")) {
my $other_ip = get_link($other_ip_path, 1);
my $other_host = $other_ip_path;
$other_host =~ s:.*/ip-::;
lwarn "Host '$other_host' IP '$other_ip' is AMBIGUOUS\n" if defined($ips{$other_host});
$ips{$other_host} = $other_ip;
lwarn "New IP '$ip' already exists for host '$host' at '$other_ip_path'\n" if ($ip eq $other_ip && $other_host eq $host);
}
_create_cluster(@_);
finish_links();
rsync_cmd($peer, "--update $mars/ips/ $peer:$mars/ips/");
}
sub merge_cluster {
my ($cmd, $peer) = @_;
my $uuid = readlink("$mars/uuid");
my @resources = lamport_glob("$mars/resource-*");
my @ip_links = lamport_glob("$mars/ips/*");
if ($cmd =~ m/-list/) {
print "UUID: $uuid\n";
print "IPs:\n";
foreach my $ip (@ip_links) {
print "$ip\n";
}
print "RESOURCEs:\n";
foreach my $i (@resources) {
print "$i\n";
}
return;
}
ldie "No peer argument given" unless $peer;
ldie "Cannot merge myself (peer='$peer', host='$host')\n" if $peer eq $host;
ldie "Directory $mars is missing\n" unless -d $mars;
ldie "A cluster UUD '$mars/uuid' does not exist. Please use 'join-cluster instead.\n" unless link_exists("$mars/uuid");
# check connections
my $check_cmd = "uname -a";
system("$check_cmd") == 0 or ldie "oops, 'uname is not installed'\n";
system("rsync --help > /dev/null") == 0 or ldie "Command 'rsync' is not installed\n";
ssh_cmd($peer, $ssh_probe);
my @old_peers;
foreach my $ip (@ip_links) {
my $old_peer = $ip;
$old_peer =~ s:^.+/ip-::;
next if $old_peer eq $host;
next if $old_peer eq $real_host;
ssh_cmd($old_peer, $ssh_probe);
push @old_peers, $old_peer;
}
# check whether merge-cluster is possible
my %total_res;
foreach my $res (@resources) {
$total_res{$res}++;
}
my $ssh_cmd = make_ssh_cmd($peer) . " marsadm merge-cluster-list";
my $answer = `$ssh_cmd`;
$answer =~ m/^UUID: (.*)$/m or ldie "cannot determine remote UUID from '$answer'\n";
my $other_uuid = $1;
ldie "Other cluster peer '$peer' has no UUID\n" unless $other_uuid;
if ($other_uuid eq $uuid) {
lprint "Other cluster peer '$peer' has the same UUID.\n";
lprint "No resource name checking necessary.\n";
lprint "Operation '$cmd' will therfore work logically idempotent.\n";
} else {
if (link_exists("$mars/tree-$peer")) {
lwarn "A valid tree signature '$mars/tree-$peer' already exists, thus it appears to be already merged!\n";
ldie "Aborting for saftey. Override via --force only if you know what you are doing!\n" unless $force;
}
# Check that both sets of resources are disjoint
lprint "Other cluster peer '$peer' has a different UUID, checking for resource name conflicts.\n";
my @other_resources;
my @conflicts;
my $copy = $answer;
$copy =~ s/\A.*?RESOURCEs:\n//ms;
while ($copy) {
$copy =~ s/\A(.*)\n$//m;
my $other_res = $1;
last unless $other_res;
push @other_resources, $other_res;
if ($total_res{$other_res}++) {
push @conflicts, $other_res;
}
}
if (@conflicts) {
lprint "CONFLICTS:\n";
foreach my $res (@conflicts) {
lprint "\t$res\n";
}
ldie "Cannot $cmd: some resource directories exist at both clusters with same name.\nThis cannot be overriden.\nPlease resolve the conflict by hand.\n";
}
lprint "List of total resources:\n";
foreach my $res (keys(%total_res)) {
lprint "\t$res\n";
}
# INTERNAL, for debugging and error analysis: backup the old uuid symlink
my $backup = "$backup_dir/uuid-backups";
system("mkdir -p $backup; cp -a $mars/uuid $backup/") unless link_exists("$backup/uuid");
}
# Start the "hot phase"
my $rsync_cmd = "--max-size=1";
rsync_cmd($peer, "$rsync_cmd $peer:$mars/uuid $mars/uuid");
foreach my $old_peer (@old_peers) {
rsync_cmd($old_peer, "$rsync_cmd $mars/uuid $old_peer:$mars/uuid");
}
$rsync_cmd .= " --update --ignore-existing";
rsync_cmd($peer, "$rsync_cmd $peer:$mars/ $mars/");
rsync_cmd($peer, "$rsync_cmd $mars/ $peer:$mars/");
}
sub leave_cluster {
my ($cmd) = @_;
ldie "mars kernel module is not loaded. This is needed for communication with some other hosts!\n" if !is_module_loaded();
my $check = "/mars/resource-*/data-$host";
ldie "I am member of some resources\n" if lamport_glob($check) && !$force;
wait_cluster($cmd) unless $force;
foreach my $path (lamport_glob("$mars/actual-*/*-$host")) {
_create_delete($path);
}
_create_delete("$mars/ips/ip-$host");
finish_links();
wait_cluster($cmd) unless $force;
foreach my $path (lamport_glob("$mars/actual-*/*-$host")) {
_create_delete($path);
}
_create_delete("$mars/todo-global/deleted-$host");
finish_links();
wait_cluster($cmd) unless $force;
while (-f "$mars/ips/ip-$host" && $timeout_val >= 0) {
sleep_timeout(3, 1);
}
system("rmmod mars") if $host eq $real_host;
foreach my $path (lamport_glob("$mars/{,resource-*/}{todo,actual}-*/*-$host")) {
unlink($path);
}
foreach my $path (lamport_glob("$mars/{,resource-*/}{todo,actual}-$host")) {
rmdir($path); # at least try it
}
}
sub create_res {
my ($cmd, $res, $dev, $appear, $size_arg) = @_;
my $create = ($cmd eq "create-resource");
ldie "undefined device or size argument\n" unless $dev;
$appear = $res if !$appear;
check_id($appear) if $appear;
my $resdir = "$mars/resource-$res";
if ($create) {
if (-d $resdir) {
lwarn "resource directory '$res' already exists\n";
my @host_list = lamport_glob("$resdir/replay-*");
if (@host_list) {
my $h_list = join(',', map({ $_ =~ s:.*/replay-::; $_; } (@host_list)));
lwarn "DANGER: hosts '$h_list' are already member of resource '$res'.\n";
ldie "REFUSING to trash your resource!\n" unless $force;
}
}
lprint "creating new resource '$res'\n";
} else {
if (link_exists("$resdir/data-$host")) {
lwarn "resource '$res' has been already joined -- this is dangerous!\n";
ldie "refusing dangerous operation\n" unless $force;
} else {
lprint "joining to existing resource '$res'\n";
}
}
my $size = 0;
if (-b $dev) {
ldie "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//;
use Fcntl 'SEEK_END', 'O_RDONLY', 'O_RDWR', 'O_EXCL';
my $flags = O_RDWR | O_EXCL;
if ($force) {
$flags = O_RDONLY;
}
sysopen(TEST, $dev, $flags) or ldie "cannot open device '$dev' for exclusive rw access\n";
$size = sysseek(TEST, 0, SEEK_END);
close(TEST);
lprint "block device '$dev': determined size = $size bytes\n";
if ($size_arg) {
my $new_size = get_size($size_arg);
ldie "size argument '$size_arg' is smaller than device size '$size'\n" unless $new_size <= $size;
lprint "reducing size from $size to $new_size\n";
$size = $new_size;
}
} elsif (-f $dev) {
lprint "Using regular (sparse) file '$dev' as an image file\n";
$size = get_size($dev);
if ($size > 0) {
$dev = "";
} else {
ldie "bad parameter '$dev': must be either a block device name, or size followed by k or m or g or t\n";
}
} else {
ldie "Device argument '$dev' does not exist or is no usable block device.\n"
}
ldie "implausible size $size" unless $size > 4096 * 16; # smaller floppies should not exist ;)
err_purge_res($cmd, $res);
my $primary;
my $replay_nr = -1;
my $use_rsync = 0;
if ($create) {
mkdir($resdir);
ldie "could not create resource '$res'\n" unless -d $resdir;
set_link($size, "$resdir/size");
} else { # join
# For safety, try to get the very newest infos.
# Reason: newer kernel modules will fetch non-member resource infos less frequently.
# Therefore we shift some responsibility for non-member -> member transitions to userspace.
my @peers = map { m:^$mars/ips/ip-(.+):; $1 } lamport_glob("$mars/ips/ip-*");
for (my $retry = 0; $retry < 3; $retry++) {
$primary = _get_designated_primary($res, 1);
last if (defined($primary) && $primary);
wait_cluster($cmd, $res, "*", 0, 8);
}
my $max_retry = 5;
for (;;) {
if ($primary && $primary ne "(none)") {
# first check whether symlink information appears to be recent enough
last if recent_cluster($cmd, $res, $primary);
# for safety, fetch newest infos from last known primary
$use_rsync++;
rsync_cmd($primary, "--max-size=1 --update $primary:$mars/resource-$res/ $mars/resource-$res/", $res, 1);
last;
}
if (!@peers || $max_retry < 0) {
lwarn "cannot get a relevant primary\n";
last;
}
# We are desperate. Try to fetch the directory from anywhere.
my $peer = shift @peers;
next if $peer eq $host;
next if $peer eq $real_host;
$use_rsync++;
rsync_cmd($peer, "--max-size=1 --update $peer:$mars/resource-$res $mars/", 1);
$primary = _get_designated_primary($res, 1);
$max_retry--;
} # retry loop
ldie "resource '$res' does not exist\n" unless -d $resdir;
ldie "resource '$res' has no designated primary\n" unless $primary && $primary ne "(none)";
finish_links();
wait_cluster($cmd, $res);
my $res_size = get_link("$mars/resource-$res/size", 1);
if ($size < $res_size) {
lwarn "size of new device is only $size, but should be $res_size\n";
ldie "refusing to join due to bad size\n" unless $force;
} elsif ($size > $res_size) {
lprint "Your physical device has size $size, which is larger than the logical resource size $res_size.\n";
lprint "This does no harm, but you are wasting some space.\n";
}
$primary = _get_designated_primary($res);
ldie "Sorry, joining is only possible if a designated primary exists.\n" if $primary eq "(none)";
ldie "implausible state: I ($host) am already designated primary of resource '$res' which I just wanted to join\n" if $primary eq $host;
ldie "my ip '$ip' is not registered -- please run 'join-cluster' first\n" unless link_exists("$mars/ips/ip-$host");
check_sync_startable(@_);
my $replay = get_link("$resdir/replay-$primary");
if ($replay =~ m/^log-([0-9]+)-/) {
$replay_nr = $1;
} else {
ldie "cannot determine current logfile number.\n";
}
}
# check for uniqeness of $appear
if ($appear) {
foreach my $old_dev (lamport_glob("$mars/resource-*/device-$host")) {
$old_dev =~ m:/resource-([^/]+)/:;
next unless defined($1);
my $old_res = $1;
next if $old_res eq $res;
my $old_name = get_link($old_dev);
if ($old_name eq $appear) {
if (link_exists("$mars/resource-$old_res/data-$host")) {
ldie "device '/dev/mars/$old_name' is already present in joined resource '$old_res'\n";
} else {
lwarn "device '/dev/mars/$old_name' is already present in another unjoined resource '$old_res' -- this does no harm, but may be confusing.\n";
}
}
}
# warn if devices are named differently throughout the cluster
foreach my $old_dev (lamport_glob("$resdir/device-*")) {
my $old_name = get_link($old_dev);
if ($old_name ne $appear) {
$old_dev =~ m:/device-(.+)$:;
my $old_host = $1;
lwarn "your name '/dev/mars/$appear' differs from '/dev/mars/$old_name' on host '$old_host'.";
lwarn "this does no harm, but may be confusing.";
}
}
}
# purge before complaining
log_purge_res($cmd, $res);
err_purge_res($cmd, $res);
finish_links();
_wait_delete();
# check for remains from former incidents
my $max_retry = 3;
while ($max_retry-- > 0) {
my @remains = ();
foreach my $remain (lamport_glob("$resdir/{log,version}-*-$host")) {
if (!$create) {
$remain =~ m/-([0-9]+)-$host/;
my $nr = $1;
next if $nr < $replay_nr;
}
lwarn "OLD REMAIN '$remain'\n";
push @remains, $remain;
}
last if !@remains;
lwarn "Resource directory has some old remains.\n";
if ($force && !$create) {
foreach my $remain (@remains) {
lwarn "REMOVING remain '$remain'";
_create_delete($remain);
}
finish_links();
_wait_delete();
next;
}
lwarn "First cleanup.\n";
my $extra_op = $create ? "delete-resource" : "log-purge-all";
lwarn "If you really want to $cmd, run 'marsadm $extra_op' first.\n";
ldie "REFUSING $cmd\n" unless $force;
}
my $file = "$resdir/data-$host";
if (!$dev) {
lwarn "link '$file' already exists - reusing\n" if link_exists($file);
lprint "setup sparse file '$file' with size $size\n";
open(OUT, ">>", $file) or ldie "could not open '$file'\n";
truncate(OUT, $size) or ldie "truncate to size $size failed\n";
close OUT;
} else {
lprint "using existing device '$dev'\n";
set_link($dev, $file);
}
if ($appear) {
lprint "resource '$res' will appear as local device '/dev/mars/$appear'\n";
set_link($appear, "$resdir/device-$host");
}
mkdir("$resdir/userspace") unless -d "$resdir/userspace";
mkdir("$resdir/defaults") unless -d "$resdir/defaults";
mkdir("$resdir/defaults-$host");
mkdir("$resdir/local-$host");
mkdir("$resdir/actual-$host");
my $todo = "$resdir/todo-$host";
mkdir($todo);
set_link("0", "$todo/attach");
set_link("1", "$todo/connect");
set_link("1", "$todo/sync");
set_link("1", "$todo/allow-replay");
unlink("$resdir/syncstatus-$host");
if ($create) {
set_link($host, "$resdir/primary");
set_link($host, "$resdir/userspace/last-primary");
set_link($size, "$resdir/syncstatus-$host");
my $startnr = get_link("$resdir/maxnr", 2);
if (defined($startnr) && $startnr ne "" && $startnr > 0) {
$startnr += 100000;
} else {
$startnr = 1;
}
my $fmt_old = sprintf("%09d", $startnr - 1);
my $fmt = sprintf("%09d", $startnr);
set_link("log-$fmt-$host,0,0", "$resdir/replay-$host");
system("touch $resdir/log-$fmt-$host") unless $dry_run;
my $old_fake = "00000000000000000000000000000000,log-$fmt_old-$host,0";
set_link("$old_fake:", "$resdir/version-$fmt_old-$host");
set_link("00000000000000000000000000000000,log-$fmt-$host,0:$old_fake", "$resdir/version-$fmt-$host");
set_link("$startnr", "$resdir/skip-check-$host") if $startnr > 1;
set_link("$startnr", "$resdir/maxnr");
my $want_path = "$resdir/systemd-want";
set_link($host, $want_path);
finish_links();
lprint "successfully created resource '$res'\n";
} else { # join
_set_replaylink($resdir, $replay_nr, $primary, "");
set_link("0", "$resdir/syncstatus-$host");
finish_links();
if ($use_rsync) {
rsync_cmd($primary, "--max-size=1 --update $file $primary:$mars/resource-$res/", 1);
}
# Re-read the primary replaylink because it might have log-rotated in the meantime
wait_cluster($cmd, $res, $primary);
my $replay = get_link("$resdir/replay-$primary");
if ($replay =~ m/^log-([0-9]+)-/) {
$replay_nr = $1;
_set_replaylink($resdir, $replay_nr, $primary, "");
}
lprint "successfully joined resource '$res'\n";
}
set_link("1", "$todo/attach");
finish_links();
_systemd_trigger($cmd);
}
sub split_cluster {
my ($cmd) = @_;
# checks
lprint "Checking...\n";
my $ips_backup = "$backup_dir/ips-backup";
system("mkdir -p $ips_backup") and ldie "OOps";
system("cp -a $mars/ips/* $ips_backup/");
foreach my $peer (lamport_glob("$ips_backup/ip-*")) {
$peer =~ s:^$ips_backup/ip-::;
ssh_cmd($peer, $ssh_probe);
}
lprint "Creating IP backups...\n";
# create ips backup
foreach my $peer (lamport_glob("$ips_backup/ip-*")) {
$peer =~ s:^$ips_backup/ip-::;
next if $peer eq $host;
rsync_cmd($peer, "--max-size=1 --update $peer:$mars/ips/ $ips_backup/", 1);
}
system("cp -a $mars/ips/* $ips_backup/");
my @peers = map { $_ =~ s:^$ips_backup/ip-::; $_ } (lamport_glob("$ips_backup/ip-*"));
ldie "Empty peer list\n" unless @peers;
foreach my $peer (@peers) {
next if $peer eq $host;
rsync_cmd($peer, "--max-size=1 --update $ips_backup $peer:$backup_dir/");
}
# Check symmetry
lprint "Checking symmetry of resource memberships...\n";
my $data_cmd = "ls $mars/resource-*/data-*";
my $standard = qx:$data_cmd:;
ldie "Cannot get local resource participants.\n" unless $standard;
foreach my $peer (@peers) {
next if $peer eq $host;
my $ssh_cmd = make_ssh_cmd($peer) . " '$data_cmd'";
my $compare = qx:$ssh_cmd:;
ldie "Asymmetric $mars/resource-*/data-* with peer $peer\n" unless $compare eq $standard;
}
# delete all hosts everywhere
lprint "Deleting all peers IPs everywhere...\n";
foreach my $peer (@peers) {
print "--- peer $peer\n";
my $cmd = "rm -rf $mars/ips";
ssh_cmd($peer, $cmd, 1);
}
# wait until all peer threads are gone
lprint "Waiting...\n";
sleep(10);
my $cond_cmd = "(( \$(ps ax | grep mars_peer | grep -v grep | wc -l) == 0 ))";
for (my $loop = 0; $loop < $timeout; $loop++) {
my $running = 0;
foreach my $peer (@peers) {
my $status = ssh_cmd($peer, $cond_cmd, 1);
$running++ if $status;
}
last if !$running;
print "--- $running peers have running peer threads.\n";
sleep(1);
}
for (my $loop = 0; $loop < 3; $loop++) {
# re-create all groups everywhere
lprint "RE-CREATE peer IPs...\n";
foreach my $peer (@peers) {
print "--- peer $peer\n";
my $cmd = "rm -f $mars/todo-global/delete* $mars/actual-*/msg-*connection-*; ";
$cmd .= "shopt -s nullglob; ";
$cmd .= "for i in $mars/resource-*; do if ! [[ -e \$i/data-$peer ]] && ! [[ -e \$i/replay-$peer ]]; then rm -rf $backup_dir/\${i##*/}; mv \$i $backup_dir/; fi; done; ";
$cmd .= "mkdir -p $mars/ips; ";
my $sub_list = "{ for dir in $mars/resource-*/data-$peer; do (cd \${dir%/*} && for i in data-*; do echo \$i; done); done; echo x-$peer; }";
my $sub_cmd = "echo RESTORE IP \$j; cp -a $ips_backup/ip-\$j $mars/ips/";
$cmd .= "for j in \$($sub_list | cut -d- -f2- | sort -u); do $sub_cmd; done";
lprint "$cmd\n";
ssh_cmd($peer, $cmd, 1);
}
}
}
sub leave_res_phase0 {
my ($cmd, $res) = @_;
check_not_primary(@_) unless $force;
my $errors = 0;
foreach my $tmp (lamport_glob("$mars/resource-$res/todo-$host/*")) {
next if $tmp =~ m:/delete:;
my $status = get_link($tmp, 2);
if ($status) {
lwarn "switch '$tmp' is not off\n";
$errors++;
}
}
foreach my $tmp (lamport_glob("$mars/resource-$res/actual-$host/{is-,logfile-}*")) {
my $status = get_link($tmp);
if ($status) {
lwarn "running status '$tmp' is not off\n";
$errors++;
}
}
if (!$force) {
check_status($cmd, $res, "is-attached", 0, 0, 1);
ldie "there were $errors errors.\n" if $errors;
}
}
sub leave_res_phase1 {
my ($cmd, $res) = @_;
set_link("1", "$mars/resource-$res/work-$host");
_create_delete("$mars/resource-$res/replay-$host");
_create_delete("$mars/resource-$res/data-$host");
_create_delete("$mars/resource-$res/syncstatus-$host");
my $syncpos = "$mars/resource-$res/syncpos-$host";
_create_delete($syncpos) if link_exists($syncpos);
my $skip_check = "$mars/resource-$res/skip-check-$host";
_create_delete($skip_check) if link_exists($skip_check);
my $vstatus = "$mars/resource-$res/verifystatus-$host";
_create_delete($vstatus) if link_exists($vstatus);
_create_delete("$mars/resource-$res/device-$host");
_create_delete("$mars/resource-$res/actsize-$host");
foreach my $dir (lamport_glob("$mars/resource-$res/*-$host/")) {
foreach my $tmp (lamport_glob("${dir}*")) {
_create_delete($tmp);
}
_create_delete($dir);
}
finish_links();
}
# wait for deletions (avoid races with following commands)
sub leave_res_phase2 {
my ($cmd, $res) = @_;
_wait_delete();
$force = 0; # this would be too dangerous
log_purge_res($cmd, $res);
err_purge_res($cmd, $res);
}
sub leave_res_phase3 {
my ($cmd, $res) = @_;
_wait_delete();
err_purge_res($cmd, $res);
system("rm -f $mars/resource-$res/log-*") if $host eq $real_host;
_systemd_trigger($cmd);
}
sub delete_res {
my ($cmd, $res) = @_;
my $basedir = "$mars/resource-$res";
# preconditions
if (! -d $basedir) {
lprint "resource directory '$basedir' does no longer exist.\n";
return;
}
my @host_list = lamport_glob("$basedir/replay-*");
my $cnt = scalar(@host_list);
if ($cnt > 0) {
my $h_list = join(',', map({ $_ =~ s:.*/replay-::;} (@host_list)));
ldie "resource '$res' is not empty: first remove the hosts '$h_list' via leave-resource\n" unless $force;
lwarn "BRUTE FORCE resource destruction: '$res' has $cnt members ($h_list) THESE ARE FINALLY TRASHED right now -- you are RESPONSIBLE for any subsequent problems.\n";
}
for my $path (`find $basedir/* | sort -r`) {
chomp $path;
next if $path =~ m:/(maxnr$|\.deleted-):;
_create_delete($path);
}
set_link("1", "$mars/resource-$res/work-$host");
finish_links();
_wait_delete();
_systemd_trigger($cmd);
}
sub logrotate_res {
my ($cmd, $res) = @_;
unless ($force) {
return if check_primary($cmd, $res, 0, 1);
}
my @paths = lamport_glob("$mars/resource-$res/log-*-$host");
my $last;
if (@paths) {
@paths = sort alphanum_cmp @paths;
$last = pop(@paths);
} else {
lwarn "no logfile exists on '$host' for resource '$res'\n";
my $replay = get_link("$mars/resource-$res/replay-$host");
$replay =~ m:^(log-[0-9]+-[^,]+),: or ldie "badly formed replaylink '$replay'\n";
$last = $1;
}
if (-z $last) {
lprint "an empty logfile '$last' already exists, nothing to do.\n" if $verbose;
return;
}
# prevent too many small logfiles when secondaries are not catching up
if (scalar(@paths) >= $thresh_logfiles) {
my @stat = stat($last);
my $size = 0;
if (@stat) {
$size = $stat[7] / (1000 * 1000 * 1000);
}
if ($size < $thresh_logsize) {
lprint "current logfile '$last' is smaller than $thresh_logsize GB, skipping logrotate.\n" if $verbose;
return;
}
}
my $nr = $last;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
my $next = sprintf("$mars/resource-$res/log-%09d-$host", $nr + 1);
ldie "logfile '$next' already exists\n" if file_exists($next);
system("touch $next") unless $dry_run;
my $startnr = get_link("$mars/resource-$res/maxnr", 1);
$startnr = $nr + 1 if ($nr >= $startnr);
set_link("$startnr", "$mars/resource-$res/maxnr");
}
sub _get_deletable_logfiles {
my ($cmd, $res) = @_;
my $min = -1;
my $max = -1;
my @log_paths = lamport_glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
foreach my $path (@log_paths) {
$path =~ m/\/log-([0-9]+)-/;
my $nr = $1;
$min = $nr if ($nr < $min || $min < 0);
$max = $nr if ($nr > $max || $max < 0);
}
my @paths = lamport_glob("$mars/resource-$res/replay-*") or ldie "cannot find any replay symlinks\n";
foreach my $path (@paths) {
my $target = get_link($path);
$target =~ m/^log-([0-9]+)/;
my $nr = $1;
$max = $nr if ($nr < $max || $max < 0);
}
return ($min, $max);
}
my %delete_nrs;
sub _get_delete_nr {
my ($basedir) = @_;
return @{$delete_nrs{$basedir}} if defined($delete_nrs{$basedir});
my $min_nr = 0;
my $max_nr = 0;
foreach my $path (lamport_glob("$basedir/deleted-*")) {
my $link = get_link($path, 1);
$link =~ m/0*([0-9]+)/;
$min_nr = $1 if (defined($1) && ($1 < $min_nr || !$min_nr));
$max_nr = $1 if (defined($1) && $1 > $max_nr);
}
my @deletes = lamport_glob("$basedir/delete-*");
foreach my $path (@deletes) {
$path =~ m:/delete-0*([0-9]+):;
$max_nr = $1 if (defined($1) && $1 > $max_nr);
}
my $nr_links = $max_nr - $min_nr;
if ($nr_links > $max_deletions / 2) {
$verbose = 1 if $verbose <= 0;
lwarn "Too many deletion links have accumulated into directory $basedir/.\n";
lwarn "Probably your networking / your firewall rules / another setup problem is causing this, and your monitoring does not notice it.\n";
lwarn "Please fix it.\n";
lwarn "When necessary, remove $basedir/delete-* link by hand.\n";
# There might be holes in the sequence numbers of delete-* links.
# Don't account for them when it comes to aborts.
if (scalar(@deletes) > $max_deletions) {
lwarn "URGENT: please fix it.\n";
ldie "Aborting for safety reasons\n" if $cron_mode && !$force;
}
}
$delete_nrs{$basedir} = [$min_nr, $max_nr];
return @{$delete_nrs{$basedir}};
}
sub _set_delete_nr {
my ($basedir, $min_nr, $max_nr) = @_;
$delete_nrs{$basedir} = [$min_nr, $max_nr];
}
my %del_dirs;
my %del_targets;
sub _replace_delete {
my ($basedir, $target) = @_;
if (!$del_dirs{$basedir}) {
foreach my $link (lamport_glob("$basedir/delete-*")) {
my $targ = readlink($link);
$del_targets{$targ} = $link;
}
$del_dirs{$basedir} = 1;
}
my ($min_nr, $delete_nr) = _get_delete_nr($basedir);
if (defined($del_targets{$target})) {
my $old = $del_targets{$target};
$old =~ m:/delete-([0-9]+)-:;
my $old_nr = $1;
return $old if $old_nr >= $min_nr;
}
my $new = sprintf("$basedir/delete-%09d-$real_host", ++$delete_nr);
_set_delete_nr($basedir, $min_nr, $delete_nr);
return $new;
}
my $nr_deletions = 0;
sub _create_delete {
my ($target) = @_;
ldie "cannot delete: '$target' is no absolute path\n" unless $target =~ m:^/:;
if (!$compat_deletions) {
lprint "delete '$target'\n" if $verbose;
set_link(".deleted", $target);
return;
}
my $global_path = "$mars/todo-global";
my $new = _replace_delete($global_path, $target);
lprint "create symlink $new -> $target\n" if $verbose;
set_link($target, $new);
$nr_deletions++;
}
sub _wait_delete {
return if $dry_run;
lwarn "Do not run this in --parallel mode\n" if $child_prefix;
return if !$nr_deletions;
$nr_deletions = 0;
for (;;) {
my $deleted = get_link("$mars/todo-global/deleted-$real_host");
$deleted =~ s/^0+//;
my ($min_nr, $delete_nr) = _get_delete_nr("$mars/todo-global");
last if $deleted >= $delete_nr;
lprint "waiting for deletions to apply locally....\n";
sleep_timeout();
}
}
sub delete_file_cmd {
my $cmd = shift;
my $res = shift; # ignore this
foreach my $path (@_) {
check_userspace($path);
_create_delete($path);
}
}
sub link_purge_res {
my ($cmd, $res) = @_;
lprint "removing left-over symlinks...\n" if $verbose;
my $start_time = mars_time();
# keep internal backups for 1 week
my $keep_backups = 24 * 7;
foreach my $leftlink (lamport_glob("$mars/backup*")) {
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * $keep_backups < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
system("rm -rf $leftlink");
}
# new deletion method
my %status = get_alive_links($res, "time", "*");
my $min_time = $start_time;
foreach my $peer (keys(%status)) {
my $stamp = $status{$peer};
$min_time = $stamp if $stamp < $min_time;
}
my $clean_glob = "$mars/{,resource-$res/}{,*/}{.*,*}";
foreach my $path (glob($clean_glob)) {
next unless -l $path;
my $val = readlink($path);
next unless $val eq ".deleted";
my $age = get_link_stamp($path);
next unless $age;
if ($age + $window/2 < $min_time) {
lprint " remove deleted '$path'\n" if $verbose;
unlink($path);
}
}
# old deletion method, hopefully to disappear somewhen in future
foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}{,*/}{.tmp,.deleted,delete,work}-*")) {
# remove outdated .deleted-* markers
if ($leftlink =~ m:/.deleted-:) {
my $info = get_link($leftlink, 2);
if ($info =~ m/^[0-9]+,[0-9]+$/) {
my ($serial, $mode) = split(/,/, $info);
my $todo_glob = "$mars/todo-global/deleted-*";
if ($mode) {
$todo_glob = "$mars/resource-$res/todo-$real_host/deleted-*"
}
# compute the minimum border
my $border = 0;
foreach my $deleted_link (lamport_glob($todo_glob)) {
my $limit = get_link($deleted_link, 2);
next if !$limit || $limit <= 1;
$border = $limit if (!$border || $limit < $border);
}
if ($serial < $border && $serial > 1) {
unlink($leftlink);
next;
}
}
}
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * 24 < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
unlink($leftlink);
}
foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}actual-*/msg-*")) {
if ($leftlink =~ m:/msg-conncetion-from-:) {
lprint " unlink deprecated '$leftlink'\n" if $verbose;
unlink($leftlink);
next;
}
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 * 4 < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
unlink($leftlink);
}
}
sub logdelete_res {
my ($cmd, $res) = @_;
lprint "removing left-over $res logfiles...\n" if $verbose;
my $start_time = mars_time();
my @paths = lamport_glob("$mars/resource-$res/log-*") or ldie "cannot find any logfiles\n";
@paths = sort alphanum_cmp @paths;
my ($min_deletable, $max_deletable) = _get_deletable_logfiles(@_);
lprint "min deletable logfile number: $min_deletable\n" if $verbose;
lprint "min non-deletable logfile number: $max_deletable\n" if $verbose;
if ($min_deletable >= $max_deletable) {
lprint "no logfiles are deletable.\n" if $verbose;
return;
}
if ($cmd eq "log-delete-one") {
$max_deletable = $min_deletable + 1; # delete only the first one
}
my $nr = 0;
for (;;) {
my $first = shift(@paths);
last unless $first;
$nr = $first;
$nr =~ s/^.*log-([0-9]+)-.+$/$1/;
next unless $nr < $max_deletable;
lprint "chosen '$first' for deletion\n" if $verbose;
_create_delete($first);
}
my @versionlinks = lamport_glob("$mars/resource-$res/version-*");
# dont remove versionlinks during split-brain as long as possible
if (scalar(@versionlinks) < $max_deletions / 8 &&
!detect_splitbrain($res, 1)) {
lwarn "SPLIT BRAIN: keep some versionlinks for better reporting\n";
return unless $force;
}
lprint "removing left-over version symlinks...\n" if $verbose;
foreach my $versionlink (@versionlinks) {
my $nrv = $versionlink;
$nrv =~ s/^.*\/version-([0-9]+)-.+$/$1/;
# we need at least one more version link than logfiles for consistency checks
next unless $nrv < $max_deletable - 1;
_create_delete($versionlink);
}
# remove outdated split-cluster remains
my %peers;
foreach my $path (lamport_glob("$mars/ips/ip-*")) {
$path =~ m:/ip-(.*):;
my $peer = $1;
$peers{$peer} = 1;
}
foreach my $leftlink (lamport_glob("$mars/{,resource-$res/}todo-*/deleted-*")) {
$leftlink =~ m:/deleted-(.*):;
my $peer = $1;
next if $peers{$peer};
my $stamp = get_link_stamp($leftlink);
next unless $stamp + 3600 < $start_time;
lprint " unlink '$leftlink'\n" if $verbose;
unlink($leftlink);
}
}
sub cron_res {
my ($cmd, $res) = @_;
$cron_mode = 1;
link_purge_res(@_);
logrotate_res(@_);
finish_links();
sleep(5);
logdelete_res(@_);
}
sub attach_res_phase0 {
my ($cmd, $res) = @_;
return if $force;
my $detach = ($cmd eq "detach");
if ($detach) {
my $device_in_use = get_link("$mars/resource-$res/actual-$host/open-count", 1);
if ($device_in_use) {
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
if ($want) {
lprint "IMPORTANT: Relying on systemd for $cmd of resource '$res'\n";
my $path = "$mars/resource-$res/todo-$host/attach";
_switch($cmd, $res, $path, 0);
finish_links();
systemd_activate($cmd, $res, 0, 1);
return;
}
my $name = get_link("$mars/resource-$res/device-$host");
my $dev = "/dev/mars/$name";
ldie "device '$dev' is in use\n";
}
}
}
# only for systemd: wait that primary device is no longer open
sub attach_res_phase0b {
my ($cmd, $res) = @_;
return unless $cmd eq "detach";
return unless systemd_present(@_);
check_status($cmd, $res, "open-count", 0, 1);
wait_cluster($cmd);
}
sub attach_res_phase1 {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
my $path = "$mars/resource-$res/todo-$host/attach";
_switch($cmd, $res, $path, !$detach);
}
sub attach_res_phase2 {
my ($cmd, $res) = @_;
my $detach = ($cmd eq "detach");
return if $force;
check_status($cmd, $res, "is-attached", $detach ? 0 : 1, 1);
if ($detach) {
system("sync");
check_mars_device($cmd, $res, 1, 1);
check_status($cmd, $res, "is-replaying", 0, 1);
check_status($cmd, $res, "is-syncing", 0, 1);
system("sync");
}
}
sub fetch_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/disconnect|pause/);
my @paths = lamport_glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/connect", !$pause);
}
}
sub fetch_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/disconnect|pause/);
my $path = "$mars/resource-$res/todo-$host/connect";
_switch($cmd, $res, $path, !$pause);
}
sub pause_sync_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
check_sync_startable(@_) if !$pause;
my @paths = lamport_glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/sync", !$pause);
}
}
sub pause_sync_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
check_sync_startable(@_) if !$pause;
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, !$pause);
}
sub pause_replay_global_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my @paths = lamport_glob("$mars/resource-$res/todo-*/");
for my $path (@paths) {
_switch($cmd, $res, "$path/replay", !$pause);
}
}
sub pause_replay_local_res {
my ($cmd, $res) = @_;
my $pause = ($cmd =~ m/pause/);
my $path = "$mars/resource-$res/todo-$host/replay";
_switch($cmd, $res, $path, !$pause);
}
sub up_res_phase0 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
attach_res_phase0("detach", $res);
} else {
attach_res_phase0("attach", $res);
}
}
sub up_res_phase1 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
pause_replay_local_res("pause-replay-local", $res);
pause_sync_local_res("pause-sync-local", $res);
fetch_local_res("pause-fetch", $res);
attach_res_phase1("detach", $res);
} else {
attach_res_phase1("attach", $res);
fetch_local_res("resume-fetch-local", $res);
# ignore ldie on sync, just do all the rest
eval {
pause_sync_local_res("resume-sync-local", $res);
};
pause_replay_local_res("resume-replay-local", $res);
}
}
sub up_res_phase2 {
my ($cmd, $res) = @_;
my $down = ($cmd eq "down");
if ($down) {
attach_res_phase2("detach", $res);
} else {
attach_res_phase2("attach", $res);
}
}
sub set_replay_res {
my ($cmd, $res, $new_nr) = @_;
if (!$new_nr || $new_nr <= 0) {
ldie "you must supply a numeric logfile number as third argument.\n";
}
check_not_primary(@_);
check_todo($cmd, $res, "replay", 0, 0);
my $replaylink = "$mars/resource-$res/replay-$host";
my $old_val = get_link($replaylink);
my $old_nr = $old_val;
$old_nr =~ s/log-([0-9]+)-.*/$1/;
ldie "old log number '$old_nr' is wrong\n" unless $old_nr > 0;
if ($new_nr > $old_nr) {
lwarn "you try to skip logfile numbers from $old_nr to $new_nr, are you sure?\n";
ldie "you would need --force if you really know what you are doing.\n" unless $force;
}
my $primary = _get_designated_primary($res);
_set_replaylink("$mars/resource-$res", $new_nr, $primary);
}
sub fake_sync_phase1 {
my ($cmd, $res) = @_;
my $path = "$mars/resource-$res/todo-$host/sync";
_switch($cmd, $res, $path, 0);
}
sub fake_sync_phase2 {
my ($cmd, $res) = @_;
check_status($cmd, $res, "is-syncing", 0, 1);
}
sub fake_sync_phase3 {
my ($cmd, $res) = @_;
my $size = get_link("$mars/resource-$res/size");
my $target = "$mars/resource-$res/syncstatus-$host";
set_link($size, $target);
}
sub _primary_res {
my ($res, $new, $old) = @_;
my $pri = "$mars/resource-$res/primary";
set_link($new, $pri);
unless ($new eq "(none)") {
my $last_lnk = "$mars/resource-$res/userspace/last-primary";
my $prev_lnk = "$mars/resource-$res/userspace/prev-primary";
system("mv -f $last_lnk $prev_lnk");
set_link($new, $last_lnk);
}
lprint "designated primary changed from '$old' to '$new'\n";
}
# check whether primary/secondary switching is possible at all
sub primary_phase0 {
my ($cmd, $res) = @_;
ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars");
if ($force) {
check_todo($cmd, $res, "fetch", 0, 0);
}
my $old = _get_designated_primary($res, -1);
lprint "Current designated primary: $old\n";
if ($cmd eq "primary") {
if ($host ne $old) {
lprint "Allowing handover in cases of sync: ignore_sync=$ignore_sync\n" if $ignore_sync;
check_sync_finished($res, $host, $ignore_sync);
# also check that other secondaries won't loose their sync primary
my @names = lamport_glob("$mars/resource-$res/data-*");
# for k <= 2 replicas, the previous check must have been sufficient
if (scalar(@names) > 2) {
my $allow_anyway = ($force || $ignore_sync);
lprint "Allowing handover in cases of sync: force=$force ignore_sync=$ignore_sync\n" if $allow_anyway;
foreach my $name (@names) {
$name =~ m:/data-(.+):;
my $peer = $1;
next if ($peer eq $old || $peer eq $host);
check_sync_finished($res, $peer, $allow_anyway);
}
}
}
check_todo($cmd, $res, "attach", 1, 0);
check_todo($cmd, $res, "fetch", 1, 0) if !$force;
check_todo($cmd, $res, "replay", 1, 0);
# check that no logfile replay errors exist.
my $replay_error = get_link("$mars/resource-$res/actual-$host/replay-code", 2);
if (defined($replay_error) && $replay_error ne "" && int($replay_error) < 0) {
lwarn "Logfile replay / recovery stopped with error code $replay_error.\n";
ldie "Won't switch to avoid unnoticed data loss. You may however do a 'primary --force'.\n" unless $force;
}
}
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
if ($want) {
my $new;
my $oper;
if ($cmd eq "primary") {
$new = $host;
$oper = "start";
} else {
$new = "(none)";
$oper = "stop";
}
set_link($new, $want_path);
my $unit_path = "$mars/resource-$res/systemd-$oper-unit";
my $unit = get_link($unit_path, 2);
if ($unit) {
if ($old ne "(none)") {
my $response_path = "$mars/resource-$res/userspace/systemd-status-stop-$old";
set_link(0, $response_path);
}
lprint "IMPORTANT: Relying on systemd for $oper of unit '$unit'\n";
lprint "IMPORTANT: unit '$unit' wanted at '$new'\n";
finish_links();
_systemd_trigger($cmd);
_trigger(3);
return;
}
}
return if ($old eq $host and $cmd eq "primary");
return if $old eq "(none)";
my $open_count_path = "$mars/resource-$res/actual-$old/open-count";
my $device_in_use = get_link($open_count_path, 1);
if ($device_in_use) {
my $name = get_link("$mars/resource-$res/device-$old", 1) || "unknown";
lwarn "device '/dev/mars/$name' for resource '$res' is $device_in_use times in use on primary host '$old'\n";
ldie "first you must umount/close the device (on host '$old')\n" unless $force;
lwarn "First you SHOULD umount/close the device (on host '$old'), but you ignore this recommendation by giving the --force option.\n";
if (is_link_recent($open_count_path)) {
lwarn "You are forcing a SPLIT BRAIN via --force right now. Do you know that this is an ERRONEOUS state? Do you really know what you are doing?\n";
} else {
lwarn "You may produce a SPLIT BRAIN via --force because the peer host '$old' is currently not reachable.\n";
}
}
lprint "all preconditions OK for resource '$res'\n";
}
# only for primary --force: wait until fetch has actually stopped
sub primary_phase0a {
my ($cmd, $res) = @_;
return unless $force;
wait_cond($cmd, $res, "is-fetch-off");
}
# only for systemd: wait that primary device is no longer open
sub primary_phase0b {
my ($cmd, $res) = @_;
return unless systemd_present(@_);
# only relevant for true handover
my $old = _get_designated_primary($res, -1);
return if ($old eq $host || $old eq "(none)");
# ignore primary --force
my $connect_path = "$mars/resource-$res/todo-$host/connect";
my $connect = get_link($connect_path, 1);
return if (!defined($connect) || !$connect);
# Notice: this is a workaround for a problem which is
# outside of our scope. For example, a remote umount will
# fail when any (delayed) process has any filehandle open.
# As long as the umount was unsuccessful, we _cannot_ proceed
# with planned handover.
# As a workaround, we constantly trigger the remote systemd
# in the _hope_ that the umount will succeed, and the
# open-count will then go down to zero, hopefully somewhen.
my $watch = "$mars/resource-$res/systemd-want";
my $action = "";
if (link_exists($watch)) {
$action = "system(\"touch -h $watch\");";
my $response_path = "$mars/resource-$res/userspace/systemd-status-stop-$old";
$action .= "\$action_status = get_link(\"$response_path\");";
my $msg = "systemctl stop on peer $old: status=\$action_status\n";
$action .= "ldie \"$msg\" if \$action_status;";
}
# try to compensate failures by systemd restart
my $compensation = undef;
if ($old ne "(none)") {
$compensation = "lprint \"Restarting '$res' on '$old'\n\"; ";
$compensation .= "_primary_res(\"$res\", \"$old\", \"(none)\"); ";
$compensation .= "set_systemd_want(\"$cmd\", \"$res\", \"$old\"); ";
$compensation .= "finish_links(); ";
$compensation .= "_trigger(3); ";
$compensation .= "lprint \"Triggered systemd at '$old'.\n\";";
}
check_status($cmd, $res, "open-count", 0, 1, undef, undef, $old, $action, $compensation);
}
# when necessary, switch to secondary (intermediately)
sub primary_phase1 {
my ($cmd, $res) = @_;
return 0 if ($force and $cmd eq "primary");
my $old = _get_designated_primary($res, -1);
return 0 if ($old eq $host and $cmd eq "primary");
my $new = "(none)";
if (!$force and $cmd eq "primary") {
my $status = try_to_avoid_splitbrain($cmd, $res, $old);
return $status if $status;
}
return 0 if $old eq $new;
_primary_res($res, $new, $old);
return 0;
}
my $phase2_waited = 0;
sub primary_phase1b {
my ($cmd, $res) = @_;
$phase2_waited = 0;
finish_links();
return 0 if $force;
my $old = _get_designated_primary($res, -1);
my $status = check_primary_gone($cmd, $res, $old);
return $status if $status;
if (!$force and $cmd eq "primary") {
my $status = try_to_avoid_splitbrain($cmd, $res, $old);
return $status if $status;
}
return 0;
}
# when necessary, wait
sub primary_phase2 {
my ($cmd, $res) = @_;
return 0 if $force;
return 0 unless $cmd eq "primary";
wait_cluster($cmd) if !$phase2_waited++;
my $old = _get_designated_primary($res, -1);
return check_primary_gone($cmd, $res, $old);
}
sub primary_phase2b {
my ($cmd, $res) = @_;
if (systemd_present(@_)) {
my $old = _get_designated_primary($res, -1);
return try_to_avoid_splitbrain($cmd, $res, $old);
}
return 0;
}
# when necessary, switch to primary
sub primary_phase3 {
my ($cmd, $res) = @_;
return unless $cmd eq "primary";
my $old = _get_designated_primary($res, -1);
my $new = $host;
_primary_res($res, $new, $old);
}
sub primary_phase3b {
finish_links();
}
# wait for device to appear / disappear
sub primary_phase4 {
my ($cmd, $res) = @_;
if($cmd eq "secondary") {
check_mars_device($cmd, $res, 1, 1);
return;
}
my $ok = detect_splitbrain($res, 1);
if (!$ok) {
lwarn "\n";
lwarn "Sorry, in split brain situations I can only set the _designated_\n";
lwarn "primary, but I cannot _guarantee_ that becoming the\n";
lwarn "_actual_ primary is always possible.\n";
lwarn "You SHOULD resolve the split brain ASAP (e.g. by leave-resource\n";
lwarn "or invalidate etc).\n";
lwarn "\n";
lwarn "If you already tried to resolve the split brain manually, but\n";
lwarn "this message does not disappear, the reason could be some\n";
lwarn "hindering left-overs/remains from the former split brain.\n";
lwarn "ONLY in such a case, try log-purge-all --force.\n";
lwarn "\n";
return;
}
check_mars_device($cmd, $res, 1, 0);
_systemd_trigger($cmd);
}
sub wait_umount_res {
my ($cmd, $res) = @_;
while (1) {
my $sum = 0;
foreach my $path (lamport_glob("$mars/resource-$res/actual-$host/open-count")) {
$sum += get_link($path);
}
last if !$sum;
lprint "device for resource '$res' is $sum times in use on $host\n";
sleep_timeout(1);
}
lprint "OK, device for resource '$res' is not in use.\n";
}
sub invalidate_res_phase0 {
my ($cmd, $res) = @_;
check_not_primary(@_);
my $primary = _get_designated_primary($res);
ldie "for operation '$cmd', some other designated primary must exist (currently there is none)\n" if $primary eq "(none)";
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
}
sub invalidate_res_phase1 {
my ($cmd, $res) = @_;
_switch($cmd, $res, "$mars/resource-$res/todo-$host/attach", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/fetch", 0);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/replay", 0);
}
sub invalidate_res_phase2 {
my ($cmd, $res) = @_;
if (!$force) {
check_status($cmd, $res, "is-syncing", 0, 1);
check_status($cmd, $res, "is-fetching", 0, 1);
check_status($cmd, $res, "is-replaying", 0, 1);
check_status($cmd, $res, "is-attached", 0, 1);
}
}
sub invalidate_res_phase3 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
my $replay = get_link("$mars/resource-$res/replay-$primary");
$replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n";
my $replay_nr = $1;
set_link("0", $dst);
finish_links(); # opportunity for errors => don't continue
for my $vers_path (lamport_glob("$mars/resource-$res/version-*-$host")) {
$vers_path =~ m:/version-([0-9]+):;
my $this_nr = $1;
_create_delete($vers_path) if $this_nr >= $replay_nr;
}
_create_delete("$mars/resource-$res/replay-$host");
finish_links();
}
sub invalidate_res_phase4 {
my ($cmd, $res) = @_;
_wait_delete();
}
sub invalidate_res_phase5 {
my ($cmd, $res) = @_;
$force = 0; # this would be too dangerous
log_purge_res(@_);
err_purge_res(@_);
}
sub invalidate_res_phase6 {
my ($cmd, $res) = @_;
_wait_delete();
}
sub invalidate_res_phase7 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
my $replay = get_link("$mars/resource-$res/replay-$primary");
$replay =~ m/^log-([0-9]+)-/ or ldie "replay link '$replay' is not parsable\n";
my $replay_nr = $1;
_set_replaylink("$mars/resource-$res", $replay_nr, $primary, "");
finish_links();
}
sub invalidate_res_phase8 {
my ($cmd, $res) = @_;
_wait_delete();
}
sub invalidate_res_phase9 {
my ($cmd, $res) = @_;
my $dst = "$mars/resource-$res/syncstatus-$host";
my $primary = _get_designated_primary($res);
ldie "Cannot execute 'invalidate' because noone is designated as primary.\n" if (!$primary || $primary eq "(none)");
ldie "Cannot invalidate the designated primary host '$primary'\n" if $primary eq $host;
_switch($cmd, $res, "$mars/resource-$res/todo-$host/attach", 1);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/fetch", 1);
_switch($cmd, $res, "$mars/resource-$res/todo-$host/replay", 1);
finish_links();
my $emergency_path = "$mars/resource-$res/actual-$primary/has-emergency";
my $emergency = get_link($emergency_path, 1);
if ($emergency) {
lwarn "Primary '$primary' is in emergency mode. Cannot start sync.\n";
} else {
_switch($cmd, $res, "$mars/resource-$res/todo-$host/sync", 1);
}
}
my %resize_device_size;
my %resize_old_size;
my %resize_new_size;
sub get_possible_size {
my ($cmd, $res) = @_;
my @actsizes = lamport_glob("$mars/resource-$res/actsize-*");
lwarn "resource $res has no actsize-* symlinks\n" unless @actsizes;
my $possible_size = 0;
foreach my $actsize (@actsizes) {
my $this_size = get_link($actsize);
if (!$possible_size || $this_size < $possible_size) {
$possible_size = $this_size;
}
}
return $possible_size;
}
sub resize_phase0 {
my ($cmd, $res, $size_arg) = @_;
ldie "mars kernel module is not loaded. This is needed for communication with some other hosts!\n" if !is_module_loaded();
my $new_size = 0;
if ($size_arg) {
$new_size = get_size($size_arg);
ldie "optional size argument '$new_size' must be numeric and positive\n" unless scalar($new_size) > 0;
lprint "new size: $new_size bytes\n";
}
check_primary($cmd, $res, 1);
my $my_size = get_link("$mars/resource-$res/actsize-$host");
my $lnk = "$mars/resource-$res/size";
my $old_size = get_link($lnk);
lprint "old_size=$old_size\n";
my $possible_size = get_possible_size($cmd, $res);
lprint "possible_size=$possible_size\n";
$new_size = $possible_size if !$new_size;
lprint "new_size=$new_size\n";
ldie "new size $new_size is higher than the possible size (minimum of all volumes) $possible_size" if $new_size > $possible_size; # no override with --force possible
# disallow decreasing
ldie "only increases of the size are possible!\n" if $new_size < $old_size;
my $waste = $my_size - $new_size;
lwarn "You are wasting $waste bytes locally\n" if $my_size > $new_size;
# remember values
$resize_device_size{$res} = _get_mars_size(@_);
$resize_old_size{$res} = $old_size;
lwarn "internal mismatch between actual device size and resource size: $resize_device_size{$res} != $resize_old_size{$res}\n" unless $resize_device_size{$res} == $resize_old_size{$res};
$resize_new_size{$res} = $new_size;
return 0;
}
sub resize_phase1 {
my ($cmd, $res) = @_;
my $old_size = $resize_old_size{$res} or ldie "bad internal size value\n";
my $new_size = $resize_new_size{$res} or ldie "bad internal size value\n";
# for safety, check again
ldie "only increases of the size are possible!\n" if $new_size < $old_size;
check_primary($cmd, $res, 1);
# Mark the primary data / its size as authoritative
my $act_lnk = "$mars/resource-$res/syncstatus-$host";
set_link($new_size, $act_lnk);
finish_links(); # Chance for errors to pop up
# Now set the new resource size
my $lnk = "$mars/resource-$res/size";
set_link($new_size, $lnk);
finish_links();
}
sub resize_phase2 {
my ($cmd, $res) = @_;
my $old_size = $resize_old_size{$res} or ldie "bad internal size value\n";
my $new_size = $resize_new_size{$res} or ldie "bad internal size value\n";
for (;;) {
my $new_device_size = _get_mars_size(@_);
if ($new_device_size == $resize_new_size{$res}) {
lprint "Device size is now $new_device_size.\n";
last;
}
lprint "Device size $new_device_size has not yet reached the new size $resize_new_size{$res}.\n";
if ($new_device_size != $resize_device_size{$res}) {
lwarn "The size has changed, but did not reach the correct value.";
lwarn "Assuming some rounding problems (which may occur at some device types)\n";
last;
}
sleep_timeout();
}
}
sub role_cmd {
my ($cmd, $res) = @_;
my $primary = _get_actual_primary($res) || '(none)';
my $todo_primary = _get_designated_primary($res);
my $msg = "I am actually ";
$msg .= ($primary eq $host) ? "primary" : "secondary";
if ($primary eq $todo_primary) {
$msg .= " and $primary is primary" if ($primary ne $host);
}
elsif ($primary ne $todo_primary) {
$todo_primary = "I" if ($todo_primary eq $host);
$msg .= " and $todo_primary should be primary";
}
lprint $msg . "\n";
}
sub mars_state_cmd {
my ($cmd, $res) = @_;
my $primary = _get_actual_primary($res) || '(none)';
my $todo_primary = _get_designated_primary($res);
if ($primary eq $host) {
lprint "is_primary\n";
return;
}
elsif ($todo_primary eq $host) {
lprint "becoming_primary\n";
return;
}
# secondary without ambitions to become primary
my $size = get_link("$mars/resource-$res/size");
my $syncstatus = get_link("$mars/resource-$res/syncstatus-$host");
if ($syncstatus != $size) {
lprint "secondary inconsistent ($syncstatus bytes of $size)\n";
return;
}
if ($primary eq "(none)") {
my $min = 0;
foreach my $path (lamport_glob("$mars/resource-$res/log-*")) {
my $nr = $path;
$nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:;
if ($nr > $min) {
$primary = $path;
$primary =~ s:^.*/[a-z]+-[0-9]+-([^/]*)$:$1:;
$min = $nr;
}
}
}
my $primary_replay = get_link("$mars/resource-$res/replay-$primary");
my $host_replay = get_link("$mars/resource-$res/replay-$host");
if ($primary_replay eq $host_replay) {
lprint "secondary uptodate\n";
return;
}
lprint "secondary outdated ($host_replay instead of $primary_replay)\n";
}
sub cat_cmd {
my $cmd = shift;
foreach my $path (@_) {
_get_text($path, undef, 1);
}
}
sub mars_info_cmd {
my ($cmd, $res) = @_;
my $info = "$mars/resource-$res/logstatus-$host.status";
cat_cmd($cmd, $info);
}
sub show_cmd {
my ($cmd, $res) = @_;
$res = "*" if !$res || $res eq "all";
my $glob = "$mars/{ips/ip-$host,alive-$host,emergency-$host,rest-space-$host,resource-$res/{device,primary,size,actsize-$host,syncstatus-$host,replay-$host,actual-$host/*,todo-$host/*}}";
foreach my $link (lamport_glob($glob)) {
next unless link_exists($link);
my $res = get_link($link);
my $short = $link;
$short =~ s:^$mars/::;
lprint "$short=$res\n";
}
}
sub show_errors_cmd {
my ($cmd, $res) = @_;
my $text = get_error_text(@_);
if ($text) {
lprint $text;
ldie "resource $res has some (old) problems.\n";
} else {
lprint "no errors/warnings are reported.\n";
}
}
sub version {
lprint "$0 Version: $marsadm_version $Id\n";
#lprint "my IP is $ip\n";
exit 0;
}
##################################################################
# pretty printing
sub seconds2human {
my $seconds = shift;
return "--:--:--" unless (defined($seconds) && $seconds ne "" && $seconds >= 0);
return sprintf("%02d:%02d:%02d", $seconds / 3600, ($seconds % 3600) / 60, $seconds % 60);
}
sub number2human {
my ($number, $unit, $max) = @_;
$max = $number if !defined($max);
my $k = 1024.0;
my $flip_over = 9.99;
if (!defined($unit) || !$unit) {
if ($max >= $k * $k * $k * $k * $flip_over) {
$unit = "T";
} elsif ($max >= $k * $k * $k * $flip_over) {
$unit = "G";
} elsif ($max >= $k * $k) {
$unit = "M";
} elsif ($max >= $k) {
$unit = "K";
} else {
$unit = "";
}
}
my $i = "i";
if ($unit =~ m/^[a-z]/) {
$i = "";
$k = 1000.0;
}
$_ = $unit;
SWITCH: {
if (/t/i) {
$number /= $k * $k * $k * $k;
$unit = "T${i}B";
last SWITCH;
}
if (/g/i) {
$number /= $k * $k * $k;
$unit = "G${i}B";
last SWITCH;
}
if (/m/i) {
$number /= $k * $k;
$unit = "M${i}B";
last SWITCH;
}
if (/k/i) {
$number /= $k;
$unit = "K${i}B";
last SWITCH;
}
$unit = "B";
}
if ($unit eq "B") {
$number = int($number + 0.1);
} else {
$number = sprintf("%.3f", $number);
}
return "$number $unit";
}
sub progress_bar {
my ($length, $min, $mid, $max) = @_;
$min = 0 if $min < 0;
$mid = $min if $mid < $min;
$max = $mid if $max < $mid;
$max = 1 if $max < 1;
my $pos1 = $length * $min / $max;
my $count = $pos1 - 1;
$count = 0 if $count < 0;
my $bar = '=' x $count;
if ($pos1 < $length) {
$bar .= ">";
$pos1++;
}
my $pos2 = $length * $mid / $max;
if ($pos1 < $pos2) {
$bar .= ':' x ($pos2 - $pos1);
}
if ($pos2 < $length) {
$bar .= '.' x ($length - $pos2);
}
return "[$bar]";
}
##################################################################
# macro evaluation
sub make_numeric {
my $number = shift;
return 0 if (!defined($number) || $number eq "");
return $number;
}
sub set_args {
my $outer_env = shift;
my $inner_env = shift;
$$inner_env{"callstack"} .= "," if $$inner_env{"callstack"};
$$inner_env{"callstack"} .= ${_[0]};
my $index = 0;
while (defined(my $next = shift)) {
my $arg = parse_macro($next, $outer_env);
$$inner_env{$index++} = $arg;
}
# clear all other number variables to avoid confusion
while (defined($$inner_env{$index})) {
undef $$inner_env{$index++};
}
}
# evaluate a primitive macro
sub eval_fn {
my $env = shift;
my $fn = shift;
# optionally allow primitive macros without prefix primitive- to be substituted
if ($fn !~ s/^primitive[-_]?//) {
my $macro = get_macro($fn, 1);
if ($macro) {
set_args($env, $env, $fn, @_);
return parse_macro($macro, $env);
}
}
my $arg1 = shift;
$_ = $fn;
SWITCH: {
if (/^$/) { # variable
my $varname;
# prefix *crement operators
if ($arg1 =~ m/^([-+]{2})$/) {
my $op = $arg1;
$varname = parse_macro(shift, $env);
if ($op =~ m/^\+/) {
$$env{$varname}++;
} else {
$$env{$varname}--;
}
} else {
$varname = parse_macro($arg1, $env);
}
my $result = "";
if (defined($$env{$varname})) {
$result = $$env{$varname};
}
# postfix *crement operators
if (defined(${_[0]}) && ${_[0]} =~ m/^([-+]{2})$/) {
my $op = shift;
if ($op =~ m/^\+/) {
$$env{$varname}++;
} else {
$$env{$varname}--;
}
}
# provisionary light-weight arrays based on CSV format
if (defined(my $delim = shift) && defined(my $index = shift)) {
$delim = parse_macro($delim, $env);
$index = parse_macro($index, $env);
my @list = split($delim, $result);
# get last element when denoted
$index = scalar(@list) - 1 if ($index eq "" || $index eq "*");
$result = $list[$index];
}
return $result;
}
if (/^let$/) { # assignment
my $varname = parse_macro($arg1, $env);
my $arg2 = shift;
my $value = parse_macro($arg2, $env);
$$env{$varname} = $value;
return "";
}
if (/^append$/) { # .=
my $varname = parse_macro($arg1, $env);
my $arg2 = shift;
my $value = parse_macro($arg2, $env);
$$env{$varname} .= $value;
return "";
}
if (/^set$/) { # provisionary light-weight arrays based on CSV format
my $varname = parse_macro($arg1, $env);
my $delim = shift;
$delim = parse_macro($delim, $env);
my $index = shift;
$index = parse_macro($index, $env);
my @list = split($delim, $$env{$varname});
# append to list when denoted
$index = scalar(@list) if ($index eq "" || $index eq "*");
my $value = shift;
$value = parse_macro($value, $env);
$list[$index] = $value;
$$env{$varname} = join($delim, @list);
return "";
}
if (/^dump[-_]?vars$/) { # write to stderr
foreach my $key (sort alphanum_cmp keys(%$env)) {
next if $key =~ m/^__.*__$/;
my $val = $$env{$key};
lprint_stderr "$key='$val'\n";
}
return "";
}
if (/^([-+*\/%&|^]|>>|<<|min|max)$/) { # arithmetic / associative operators
my $op = $1;
my $number = make_numeric(parse_macro($arg1, $env));
while (defined(my $next = shift)) {
my $operand = make_numeric(parse_macro($next, $env));
$_ = $op;
if (/^\+$/) { $number += $operand; next; }
if (/^-$/) { $number -= $operand; next; }
if (/^\*$/) { $number *= $operand; next; }
if (/^\/$/) { $number /= $operand; next; }
if (/^%$/) { $number %= $operand; next; }
if (/^&$/) { $number &= $operand; next; }
if (/^\|$/) { $number |= $operand; next; }
if (/^\^$/) { $number ^= $operand; next; }
if (/^<<$/) { $number <<= $operand; next; }
if (/^>>$/) { $number >>= $operand; next; }
if (/^min$/) { $number = $operand if $number < $operand; next; }
if (/^max$/) { $number = $operand if $number > $operand; next; }
ldie "bad arithmetic operator '$op'";
}
return $number;
}
if (/^([<>]=?|[!=]=)$/) { # numeric comparisons
my $op = $1;
my $n1 = make_numeric(parse_macro($arg1, $env));
my $arg2 = shift;
my $n2 = make_numeric(parse_macro($arg2, $env));
$_ = $op;
if (/^<$/) { return $n1 < $n2; }
if (/^>$/) { return $n1 > $n2; }
if (/^<=$/) { return $n1 <= $n2; }
if (/^>=$/) { return $n1 >= $n2; }
if (/^==$/) { return $n1 == $n2; }
if (/^!=$/) { return $n1 != $n2; }
ldie "bad comparison operator '$op'";
}
if (/^(lt|gt|le|ge|eq|ne|match|=~)$/) { # binary string operators
my $op = $1;
$op = "=~" if $op eq "match";
my $n1 = parse_macro($arg1, $env);
my $arg2 = shift;
my $n2 = parse_macro($arg2, $env);
$_ = $op;
if (/^lt$/) { return $n1 lt $n2; }
if (/^gt$/) { return $n1 gt $n2; }
if (/^le$/) { return $n1 le $n2; }
if (/^ge$/) { return $n1 ge $n2; }
if (/^eq$/) { return $n1 eq $n2; }
if (/^ne$/) { return $n1 ne $n2; }
if (/^=~$/) {
my $opts = "m";
my $arg3 = shift;
$opts = parse_macro($arg3, $env) if defined($arg3);
# unfortunately standard regex operators don't seem to accept variable options, so we use eval()
my $result = eval("\$n1 =~ m{$n2}$opts");
return "" unless defined($result);
return $result;
}
ldie "bad binary operator '$op'";
}
if (/^(&&|\|\||and|or)$/) { # shortcut operators
my $op = $1;
$op = "&&" if $op eq "and";
$op = "||" if $op eq "or";
my $number = parse_macro($arg1, $env);
while (defined(my $next = shift)) {
$_ = $op;
if (/^&&$/) { return 0 if !$number; }
if (/^\|\|$/) { return 1 if $number; }
my $operand = parse_macro($next, $env);
$_ = $op;
if (/^&&$/) { $number &= $operand; next; }
if (/^\|\|$/) { $number |= $operand; next; }
ldie "bad shortcut operator '$op'";
}
return $number;
}
if (/^([~!]|not)$/) { # unary operators
my $op = $1;
$op = "!" if $op eq "not";
my $number = parse_macro($arg1, $env);
$_ = $op;
if (/^~$/) { return ~$number; }
if (/^!$/) { return !$number; }
ldie "bad unary operator '$op'";
}
# string functions
if (/^toupper$/) {
my $txt = parse_macro($arg1, $env);
return uc($txt);
}
if (/^tolower$/) {
my $txt = parse_macro($arg1, $env);
return lc($txt);
}
if (/^length$/) { # string length
my $txt = parse_macro($arg1, $env);
return length($txt);
}
if (/^subst$/) { # regex substitution operator
my $txt = parse_macro($arg1, $env);
my $arg2 = shift;
my $regex = parse_macro($arg2, $env);
my $arg3 = shift;
my $subst = parse_macro($arg3, $env);
my $opts = "m";
my $arg4 = shift;
$opts = parse_macro($arg4, $env) if defined($arg4);
# unfortunately standard regex operators don't seem to accept variable options, so we use eval()
eval("\$txt =~ s{$regex}{$subst}$opts");
return $txt;
}
if (/s?printf$/) { # sprintf()
my $fmt = $arg1; # exception: don't evaluate the format string!
my @list = ();
while (defined(my $next = shift)) {
my $operand = parse_macro($next, $env);
push @list, $operand;
}
return sprintf($fmt, @list);
}
if (/^human[-_]?numbers?$/) { # convert numbers to readable format
my $name = $_;
my $unit = parse_macro($arg1, $env);
my $arg2 = shift;
my $delim_numbers = parse_macro($arg2, $env);
$delim_numbers = "/" if $delim_numbers eq "";
my $arg3 = shift;
my $delim_unit = parse_macro($arg3, $env);
$delim_unit = " " if $delim_unit eq "";
my $max = 0;
my @list = ();
while (defined(my $next = shift)) {
my $number = make_numeric(parse_macro($next, $env));
push @list, $number;
$max = $number if $number > $max;
}
lwarn "macro $name: no number arguments given\n" unless @list;
my $conv_unit = "";
my @results = ();
foreach my $number (@list) {
my $conv_number = number2human($number, $unit, $max);
$conv_number =~ s/ *([a-z].*)//i;
$conv_unit = $1;
push @results, $conv_number;
}
return join($delim_numbers, @results) . "$delim_unit$conv_unit";
}
if (/^human[-_]?seconds$/) { # convert numbers to readable format
# don't use make_numeric() here in order to allow the result '--:--:--'
my $number = parse_macro($arg1, $env);
return seconds2human($number);
}
if (/^progress$/) { # progress bar
my $length = make_numeric(parse_macro($arg1, $env));
$length = 20 unless ($length && $length > 0);
my $arg2 = shift;
if (!defined($arg2)) { # use default values
my @vector = split(':', eval_fn($env, "summary-vector", ":"));
return progress_bar($length, @vector);
}
my $min = make_numeric(parse_macro($arg2, $env));
my $arg3 = shift;
my $mid = make_numeric(parse_macro($arg3, $env));
my $arg4 = shift;
my $max = make_numeric(parse_macro($arg4, $env));
return progress_bar($length, $min, $mid, $max);
}
if (/^errno[-_]?text$/) {
my $code = parse_macro($arg1, $env);
return "" unless defined($code) && $code != 0;
$code = -$code if $code < 0;
return __conv_errno("", $code);
}
if (/^get[-_]?log[-_]?status/) {
return get_error_text($$env{"cmd"}, $$env{"res"});
}
if (/^get[-_]?resource[-_]?(fat|err|wrn)([-_]?count)?/) {
my $what = $1;
my $do_count = $2;
my %assoc = ("fat" => 4, "err" => 3, "wrn" => 2);
my $glob = $$env{"resdir"} . "/$assoc{$what}.*.status";
return _get_text($glob, undef, 0, $do_count);
}
if (/^warn/) {
my $txt = parse_macro($arg1, $env);
lwarn $txt;
return "";
}
if (/^die$/) {
my $txt = parse_macro($arg1, $env);
ldie $txt;
return "";
}
if (/^is-module-loaded$/) {
my $path = "/proc/sys/mars";
my $result = -d $path;
return defined($result) && $result;
}
# list objects
if (/^(count[-_]?)?cluster[-_]?members$/) {
my $do_count = $1;
my @peers = lamport_glob("$mars/ips/ip-*");
return scalar(@peers) if defined($do_count);
my $list = "";
foreach my $peer (sort alphanum_cmp @peers) {
$peer =~ s:^$mars/ips/ip-::;
$list .= "$peer\n";
}
return $list;
}
if (/^(count[-_]?)?resource[-_]?members$/) {
my $do_count = $1;
my @peers = lamport_glob($$env{"resdir"} . "/data-*");
return scalar(@peers) if defined($do_count);
my $list = "";
foreach my $peer (sort alphanum_cmp @peers) {
$peer =~ s:^.*/data-::;
$list .= "$peer\n";
}
return $list;
}
if (/^(my|all)[-_]?resources$/) {
my $what = $1;
my $peer = "*";
if ($what eq "my") {
$peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
}
my @list = lamport_glob("$mars/resource-*/data-$peer");
map { s:^$mars/resource-(.*?)/data-.*:$1:; } @list;
my $list = "";
my $old = "";
foreach my $item (sort alphanum_cmp @list) {
$list .= "$item\n" if $item ne $old;
$old = $item;
}
return $list;
}
# low-level symlink access
if (/^readlink$/) {
my $path = parse_macro($arg1, $env);
return get_link($path, 1);
}
if (/^setlink$/) {
my $src = parse_macro($arg1, $env);
my $arg2 = shift;
my $dst = parse_macro($arg2, $env);
set_link($src, $dst);
return "";
}
# high-level state access
if (/^(get|todo|actual)[-_]?primary$/) {
my $op = $1;
my $primary;
if ($op eq "actual") {
lwarn "DEPRECATED: you are trying to uniquely identify an actual primary hostname, but this is conceptually wrong because in split-brain situations there may exist multiple ones. Use view-is-primary instead. That would be safe.\n" unless $$env{cmd} =~ m/-1and1$/;
$primary = _get_actual_primary($$env{"res"});
} else {
$primary = _get_designated_primary($$env{"res"});
}
$primary = "" if (!defined($primary) || $primary eq "(none)");
$primary = ($primary eq $$env{"host"}) if $op eq "todo";
return $primary;
}
if (/^todo[-_]?(attach|sync|fetch|replay)?$/) {
my $what = $1;
$what = parse_macro($arg1, $env) unless defined($what);
my $lnk = $$env{"resdir"} . "/todo-" . $$env{"host"} . "/$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^get[-_]?msg$/) {
my $what = parse_macro($arg1, $env);
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/msg-$what";
return get_link($lnk, 1);
}
if (/^(all|the)[-_](pretty[-_]?)?(global[-_]?)?((?:err|wrn|inf)[-_])?(msg|count)$/) {
my $shorten = ($1 eq "the");
my $pretty = $2;
my $global = $3;
my $specific = $4;
my $type = $5;
$specific = "" unless defined($specific);
my $glob = "$mars";
$glob = $$env{"resdir"} if (!defined($global) && $$env{"res"});
$glob .= "/actual-" . $$env{"host"} . "/msg-$specific*";
my $result = "";
my $count = 0;
foreach my $msg_path (lamport_glob($glob)) {
my $val = get_link($msg_path, 1);
if ($shorten) {
# skip uninteresting messages
next if $val =~ m/^OK/;
# skip _transient_ error messages
if ($msg_path =~ m/-err-/ && $val =~ m/^([0-9]+\.[0-9]{9})/) {
my $stamp = $1;
if ($stamp) {
my $delta = $timeout > 0 ? $timeout : 30;
next if $stamp + $delta > time();
}
}
}
$val = _replace_timestamps($val, $shorten) if defined($pretty);
my $key = $msg_path;
$key =~ s:^.*/msg-::;
$result .= "$key: $val\n";
$count++;
}
return $count if $type eq "count";
return $result;
}
if (/^(tree|features|)[-_]?version$/) {
my $op = $1;
return get_link("$mars/$op-" . $$env{"host"}, 2);
}
if (/^is[-_]?alive$/) {
my $peer = parse_macro($arg1, $env);
$peer = _get_designated_primary($$env{"res"}) unless $peer;
$peer = $$env{"host"} unless $peer;
my $lnk = "$mars/alive-$peer";
return is_link_recent($lnk, $$env{"window"});
}
if (/^alive[-_]?timestamp$/) {
my $peer = parse_macro($arg1, $env);
$peer = _get_designated_primary($$env{"res"}) unless $peer;
return -1 if !$peer || $peer eq "(none)";
my $lnk = "$mars/alive-$peer";
return get_link_stamp($lnk);
}
if (/^is[-_]?orphan$/) {
my $peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
return 0 if eval_fn($env, "is-primary", $peer);
my $replay = get_link($$env{"resdir"} . "/replay-$peer", 1);
$replay =~ m/^(log-[^,]+),([0-9]*)/;
my $logfile = $$env{"resdir"} . "/" . $1;
my $logpos = $2;
if (! -r $logfile) {
return 1;
}
my @stat = stat($logfile);
if (!@stat) {
return 1;
}
my $size= $stat[7];
if ($size < $logpos) {
return 1;
}
return 0;
}
if (/^is[-_]?(almost[-_]?)?consistent$/) {
my $almost = $1;
# has sync finished?
my $syncrest = make_numeric(eval_fn($env, "sync-rest", ""));
return 0 if $syncrest > 0;
if (!$almost && eval_fn($env, "is-primary", "")) {
# is the replay link indicating that something is not yet applied / dirty?
my $replay = get_link($$env{"resdir"} . "/replay-" . $$env{"host"}, 1);
$replay =~ m:,[0-9]+,([0-9]+):;
my $rest = $1;
return 0 if $rest > 0;
}
# are all logfiles applied which had accumulated during sync?
my $replay = get_link($$env{"resdir"} . "/replay-" . $$env{"host"}, 1);
return 0 unless $replay;
my $syncpos = get_link($$env{"resdir"} . "/syncpos-" . $$env{"host"}, 2);
if (defined($syncpos) && $syncpos) {
my $cmp = compare_replaylinks($syncpos, $replay);
return 0 if $cmp > 0;
}
return 1;
}
if (/^get[-_]?(disk|device)$/) {
my $what = $1;
$what = "data" if $what eq "disk";
my $lnk = $$env{"resdir"} . "/$what-" . $$env{"host"};
my $result = get_link($lnk, 1);
$result = "" unless defined($result);
$result = "/dev/mars/$result" if ($what eq "device" && $result !~ m:^/:);
return $result;
}
if (/^(disk|device)[-_]?present$/) {
my $what = $1;
$what = "data" if $what eq "disk";
my $lnk = $$env{"resdir"} . "/$what-" . $$env{"host"};
my $result = get_link($lnk, 1);
$result = "" unless defined($result);
$result = "/dev/mars/$result" if ($what eq "device" && $result !~ m:^/:);
if ($result) {
$result = -b $result;
$result = "0" unless defined($result);
}
return $result;
}
# deprecated (irregular names)
if (/^present[-_]?(disk|device)$/) {
my $what = $1;
return eval_fn($env, "$what-present", $arg1);
}
if (/^(device)[-_]?opened$/) {
my $what = $1;
my $peer = $$env{"host"};
if ($what eq "device") {
my $other = eval_fn($env, "get-device", $arg1);
if ($other =~ m/\@(.*)/) {
$peer = $1;
}
}
my $lnk = $$env{"resdir"} . "/actual-$peer/open-count";
my $result = get_link($lnk, 1);
$result = "0" unless defined($result);
return $result;
}
if (/^is[-_]?split([-_]?brain)?$/) {
my $split = detect_splitbrain($$env{"res"}, 0);
return $split ? 0 : 1;
}
if (/^is[-_]?(attach|sync|fetch|replay|primary|emergency)$/) {
my $what = $1;
my $is = "is";
$is = "has" if $what eq "emergency";
my $peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
my $lnk = $$env{"resdir"} . "/actual-$peer/$is-$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^does$/) {
my $what = parse_macro($arg1, $env);
my $is = "is";
$is = "has" if $what eq "emergency";
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/$is-$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^(tree|rest-space)$/) {
my $what = $1;
my $lnk = "$mars/$what-$host";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^systemd[-_]?unit$/) {
return _get_systemd_unit($$env{"cmd"}, $$env{"res"});
}
if (/^(uuid)$/) {
my $what = $1;
my $lnk = "$mars/$what";
$lnk = correct_path($lnk);
return get_link($lnk, 1);
}
if (/^replay[-_]?code$/) {
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/replay-code";
return get_link($lnk, 2);
}
if (/^(sync|fetch|replay|work)[-_]?(rate|remain)$/) {
my $what = $1;
my $select = $2;
if ($what eq "work") {
my $val1 = eval_fn($env, "fetch-$select", "");
my $val2 = eval_fn($env, "replay-$select", "");
return "" if (!defined($val1) || $val1 eq "");
return "" if (!defined($val2) || $val2 eq "");
$val1 = make_numeric($val1);
$val2 = make_numeric($val2);
return $val1 + $val2 if $select eq "remain";
# take the maximum rate
return $val1 if $val1 > $val2;
return $val2;
}
my %names =
(
"sync" => "sync_rate",
"fetch" => "file_rate",
"replay" => "replay_rate",
);
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/" . $names{$what};
my $rate = get_link($lnk, 2);
return "" if !defined($rate) || $rate eq "" || $rate < 0;
return $rate * 1024 if $select eq "rate";
if ($select eq "remain") {
my $rest = make_numeric(eval_fn($env, "$what-rest", ""));
return 0 if $rest <= 0;
return -1 if ($rate <= 0);
return $rest / 1024 / $rate;
}
ldie "unknown macro $_\n";
}
if (/^sync[-_]?size$/) {
my $lnk = $$env{"resdir"} . "/size";
return get_link($lnk, 1);
}
if (/^sync[-_]?pos$/) {
my $lnk = $$env{"resdir"} . "/syncstatus-" . $$env{"host"};
return get_link($lnk, 1);
}
if (/^(replay)[-_]?(lognr|basenr)$/) {
my $op = $2;
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 1);
return $firstpos if $op eq "lognr";
return $logpos;
}
if (/^(fetch)[-_]?(lognr)$/) {
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 1);
return $logpos;
}
if (/^(work)[-_]?(lognr)$/) {
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0);
return $logpos;
}
if (/^replay[-_]?logcount$/) {
return eval_fn($env, "replay-lognr", "") - eval_fn($env, "replay-basenr", "");
}
if (/^(fetch|work)[-_]?logcount$/) {
my $what = $1;
return eval_fn($env, "$what-lognr", "") - eval_fn($env, "replay-lognr", "");
}
if (/^writeback[-_]?rest$/) {
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0);
return $sum;
}
if (/^(replay|work)[-_]?(pos)$/) {
my $what = $1;
my $op = $2;
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0);
return $sum;
}
if (/^(replay[-_]?size)|(fetch[-_]?pos)$/) {
my ($sum0) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0);
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 1);
return $sum0 + $sum;
}
if (/^(fetch|work)[-_]?(size)$/) {
my $what = $1;
my ($sum0) = get_amount($$env{"resdir"}, $$env{"host"}, -1, 0);
my ($sum, $firstpos, $logpos, $level) = get_amount($$env{"resdir"}, $$env{"host"}, 1, 0);
return $sum0 + $sum;
}
if (/^(sync|fetch|replay|work)[-_]?(rest|(?:almost[-_]?|threshold[-_]?)?reached|percent|permille|vector)$/) {
my $what = $1;
my $op = $2;
my $size = make_numeric(eval_fn($env, "$what-size", ""));
my $pos = make_numeric(eval_fn($env, "$what-pos", ""));
my $result = 0;
if ($op eq "rest") {
$result = $size - $pos if $pos < $size;
} elsif ($op =~ m/^almost/) {
my $limit = make_numeric(parse_macro($arg1, $env)) if $arg1 ne "";
$limit = 990 if $limit <= 0;
$result = 1 if int($pos / $limit) >= int($size / 1000);
} elsif ($op =~ m/^threshold/) {
my $my_threshold = make_numeric($$env{"threshold"});
$arg1 = parse_macro($arg1, $env);
$my_threshold = make_numeric(get_size($arg1)) if $arg1 ne "";
$result = 1 if $pos + $my_threshold >= $size;
} elsif ($op eq "reached") {
$result = 1 if $pos >= $size;
} elsif ($op eq "percent") {
$result = 100;
$result = 100.5 * $pos / $size if $size > 0;
$result = 100 if $result >= 100.0;
} elsif ($op eq "permille") {
$result = 1000;
$result = 1000.5 * $pos / $size if $size > 0;
$result = 1000 if $result >= 1000.0;
} elsif ($op eq "vector") {
my $delim = parse_macro($arg1, $env);
$delim = ":" unless $delim;
$result = "$pos$delim$size";
} else {
ldie "unknown operation '$op'\n";
}
return $result;
}
if (/^summary[-_]?vector$/) {
my $pos1 = make_numeric(eval_fn($env, "replay-pos", ""));
my $pos2 = make_numeric(eval_fn($env, "fetch-pos", ""));
my $size = make_numeric(eval_fn($env, "fetch-size", ""));
my $delim = parse_macro($arg1, $env);
$delim = ":" unless $delim;
return "$pos1$delim$pos2$delim$size";
}
if (/^deletable[-_]?size$/) {
my ($min, $max) = _get_deletable_logfiles($_, $$env{"res"});
my $sum = 0;
foreach my $path (lamport_glob("$mars/resource-" . $$env{"res"} . "/log-*")) {
$path =~ m/\/log-([0-9]+)-/;
my $nr = $1;
next if $nr < $min or $nr >= $max;
my @stat = stat($path);
$sum += $stat[7];
}
return $sum;
}
if (/^occupied[-_]?size$/) {
my $sum = 0;
foreach my $path (lamport_glob("$mars/resource-" . $$env{"res"} . "/log-*")) {
$path =~ m/\/log-([0-9]+)-/;
my @stat = stat($path);
$sum += $stat[7];
}
return $sum;
}
if (/^(disk|resource|device)[-_]?size$/) {
my $what = $1;
my $path = "$mars/resource-" . $$env{"res"};
if ($what eq "device" && eval_fn($env, "device-present", "")) {
return _get_mars_size($$env{"cmd"}, $$env{"res"});
} elsif ($what eq "disk"){
my $peer = parse_macro($arg1, $env);
$peer = $$env{"host"} unless $peer;
$path .= "/actsize-$peer";
} else {
$path .= "/size";
}
return get_link($path, 1);
}
if (/^resource[-_]?possible[-_]?size$/) {
return get_possible_size($$env{"cmd"}, $$env{"res"});
}
# time handling and pausing
if (/^time$/) {
return mars_time();
}
if (/^real[-_]?time$/) {
my $time = time();
return $time;
}
if (/^replay[-_]?timestamp$/) {
$arg1 = parse_macro($arg1, $env);
$arg1 = $$env{"host"} unless $arg1;
my $replay = $$env{"resdir"} . "/replay-$arg1";
return get_link_stamp($replay);
}
if (/^fetch[-_]?timestamp$/) {
$arg1 = parse_macro($arg1, $env);
$arg1 = $$env{"host"} unless $arg1;
my $fetch_glob = $$env{"resdir"} . "/version-*-$arg1";
my @paths = sort alphanum_cmp lamport_glob($fetch_glob);
return -1 if !@paths;
my $fetch = pop(@paths);
return get_link_stamp($fetch);
}
if (/^work[-_]?timestamp$/) {
my $time1 = make_numeric(eval_fn($env, "fetch-timestamp", $arg1));
my $time2 = make_numeric(eval_fn($env, "replay-timestamp", $arg1));
# use the "best" of both...
return $time1 if $time1 > $time2;
return $time2;
}
if (/^(fetch|replay|work|alive)[-_]?age$/) {
my $what = $1;
my $time = make_numeric(eval_fn($env, "$what-timestamp", $arg1));
return -1 if $time <= 0;
return int(mars_time()) - $time;
}
if (/^(fetch|replay|work|alive)[-_]?lag$/) {
my $what = $1;
$arg1 = $$env{"host"} unless $arg1;
my $arg2 = shift;
my $primary = parse_macro($arg2, $env);
if (!$primary) {
$primary = _get_designated_primary($$env{"res"});
}
return -1 if !$primary || $primary eq "(none)";
my $time1 = make_numeric(eval_fn($env, "$what-timestamp", $arg1));
my $time2 = make_numeric(eval_fn($env, "$what-timestamp", $primary));
return -1 if ($time1 <= 0 || $time2 <= 0);
return 0 if $time1 < $time2;
return $time1 - $time2;
}
if (/^sleep$/) {
my $time = parse_macro($arg1, $env);
sleep($time);
return "";
}
if (/^timeout$/) {
my $time = parse_macro($arg1, $env);
sleep_timeout($time);
return "";
}
if (/^wait[-_]?((?:todo|is)[-_](?:attach|sync|fetch|replay|primary)[-_](?:on|off))$/) {
my $specific = $1;
$specific =~ s/_/-/g;
wait_cond($$env{"cmd"}, $$env{"res"}, $specific);
return "";
}
if (/^wait(?:[-_]?resource)?$/) {
my $specific = parse_macro($arg1, $env);
wait_cond($$env{"cmd"}, $$env{"res"}, $specific);
return "";
}
if (/^wait[-_]?cluster$/) {
my $specific = parse_macro($arg1, $env);
wait_cluster($$env{"cmd"}, $$env{"res"}, $specific);
return "";
}
# generic flow control and loops
if (/^(get|foreach)[-_]?glob$/) {
my $op = $1;
my $paths = parse_macro($arg1, $env);
my $arg2 = shift;
my $varname = parse_macro($arg2, $env);
my $arg3 = shift;
my @list = lamport_glob($paths);
my $result = "";
if ($op eq "get") {
my $delim = parse_macro($arg3, $env);
foreach my $path (@list) {
$result .= $delim if $result;
$result .= $path;
}
} else { # foreach
foreach my $path (@list) {
$$env{$varname} = $path;
$result .= parse_macro($arg3, $env);
}
}
return $result;
}
if (/^(if|unless)$/) {
my $op = $1;
my $cond = parse_macro($arg1, $env);
$cond = !$cond if $op eq "unless";
my $arg2 = shift;
if ($cond) {
ldie "undefined $op-part\n" unless defined($arg2);
return parse_macro($arg2, $env);
} elsif (defined(my $arg3 = shift)) {
return parse_macro($arg3, $env);
}
return "";
}
if (/^else?(if|unless)$/) {
my $op = $1;
unshift @_, $arg1;
while (defined(my $arg1 = shift)) {
if (defined(my $arg2 = shift)) {
my $cond = parse_macro($arg1, $env);
$cond = !$cond if $op eq "unless";
if ($cond) {
return parse_macro($arg2, $env);
}
} else { # odd number of arguments is treated as final "else"
return parse_macro($arg1, $env);
}
}
return "";
}
if (/^while$/) {
my $arg2 = shift;
my $result = "";
while (parse_macro($arg1, $env)) {
$result .= parse_macro($arg2, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^until$/) {
my $arg2 = shift;
my $result = "";
until (parse_macro($arg1, $env)) {
$result .= parse_macro($arg2, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^for$/) {
my ($arg2, $arg3, $arg4) = (shift, shift, shift);
my $result = "";
for (parse_macro($arg1, $env); parse_macro($arg2, $env); parse_macro($arg3, $env)) {
$result .= parse_macro($arg4, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^foreach$/) {
my $varname = parse_macro($arg1, $env);
my $arg2 = shift;
my $txt = parse_macro($arg2, $env);
my $arg3 = shift;
my $delim = parse_macro($arg3, $env);
my $arg4 = shift;
my $result = "";
foreach my $value (split($delim, $txt)) {
$$env{$varname} = $value;
$result .= parse_macro($arg4, $env);
next if _control_macro($env, "__next__");
last if _control_macro($env, "__last__");
}
return $result;
}
if (/^protect$/) { # don't evaluate argument, take verbatim
return $arg1;
}
if (/^eval$/) { # evaluate given number of times
my $count = parse_macro($arg1, $env);
my $arg2 = shift;
while ($count-- > 0) {
$arg2 = parse_macro($arg2, $env);
}
return $arg2;
}
if (/^eval[-_]?down$/) { # evaluate until result is stable
for (;;) {
my $old = $arg1;
$arg1 = parse_macro($arg1, $env);
last if $arg1 eq $old;
}
return $arg1;
}
if (/^tmp$/) { # evaluate once in a temporary scope
my %copy_env = %$env;
my $result = parse_macro($arg1, \%copy_env);
return $result;
}
if (/^(include|call)$/) {
my $op = $1;
my $name = parse_macro($arg1, $env);
my $txt = get_macro($name);
if ($op eq "call") { # run in new sub-scope
my %copy_env = %$env;
set_args($env, \%copy_env, $name, @_);
return parse_macro($txt, \%copy_env);
}
# 'include' runs in the same scope
set_args($env, $env, $name, @_);
return parse_macro($txt, $env);
}
if (/^callstack$/) {
return $$env{"callstack"};
}
if (/^(abort|return|stop-eval)$/) {
my $op = $1;
$$env{$op} = 1;
return "";
}
if (/^(next|last)$/) {
my $op = $1;
$$env{$op} = 1;
$$env{"__return__"} = 1;
return "";
}
ldie "call to unknown macro '$fn'\n";
}
}
##################################################################
# macro parsing
sub _control_macro {
my $env = shift;
my $control = shift;
my $result = $$env{$control};
$$env{$control} = 0;
return $result;
}
sub parse_macro {
my ($text, $env) = @_;
$text = "" unless defined($text);
my $old_callstack = $$env{"callstack"};
my $result = "";
while ($text =~ m/\\(.)|$match_fn/sp) {
my $escape = $1;
my $fn = $2;
my @args = ($3);
my $pre = $PREMATCH;
my $post = $POSTMATCH;
if (defined($escape)) {
$result .= $pre;
$text = $post;
$_ = $escape;
if (/[tnrfbae]/) {
eval "\$result .= \"\\$escape\"";
next;
}
if (/[a-zA-Z]/) {
lwarn "control sequence '\\$escape' is reserved for future use\n";
}
$result .= $escape;
next;
}
return "" if _control_macro($env, "__abort__");
return $result if _control_macro($env, "__return__");
return $result . $text if _control_macro($env, "__stop_eval__");
$result .= $pre;
$text = $post;
while ($text =~ m/\A\{($match_inner)\}/sp ) {
push @args, $1;
$text = $POSTMATCH;
}
my $new = eval_fn($env, $fn, (@args));
ldie "undefined result from evaluation of primitive macro '$fn'\n" unless defined($new);
$$env{"callstack"} = $old_callstack;
$result .= $new;
}
return "" if _control_macro($env, "__abort__");
return $result if _control_macro($env, "__return__");
return $result . $text if _control_macro($env, "__stop_eval__");
return $result . $text;
}
sub make_env {
my ($cmd, $res, $text) = (shift, shift, shift);
$text =~ s{$match_comment}{}sg;
my %start_env =
(
"cmd" => $cmd,
"res" => $res,
"resdir" => "$mars/resource-$res",
"mars" => $mars,
"host" => $host,
"real_host" => $real_host,
"ip" => $ip,
"timeout" => $timeout,
"threshold" => $threshold,
"window" => $window,
"force" => $force,
"dry-run" => $dry_run,
"verbose" => $verbose,
"callstack" => "",
# internal, deliberately not documented
"__abort__" => 0,
"__return__" => 0,
"__stop_eval__" => 0,
"__next__" => 0,
"__last__" => 0,
);
set_args(\%start_env, \%start_env, $cmd, @_);
return ($text, \%start_env);
}
sub eval_macro {
my ($text, $start_env) = make_env(@_);
return parse_macro($text, $start_env);
}
##################################################################
# macro commands
my $macro = "";
my %complex_macros =
(
"default"
=> "%if{%{res}}{"
. " %{res} [%count-resource-members{%{res}}/%count-cluster-members{}]"
. " %include{diskstate} %include{replstate} %include{flags} %include{role} %include{primarynode} %include{commstate}\n"
. "%if{%>{%-{%disk-size{}}{%resource-size{}}}{%{threshold}}}{"
. " Hint: you are wasting %human-numbers{}{ }{ }{%-{%disk-size{}}{%resource-size{}}} on disk %get-disk{}\n"
. "}"
. "%if{%>{%resource-possible-size{}}{%resource-size{}}}{"
. " Hint: marsadm resize would increase resource %{res} by %human-numbers{}{ }{ }{%-{%resource-possible-size{}}{%resource-size{}}}\n"
. "}"
. "%if{%and{%not{%is-primary{}}}{%is-attach{}}}{"
. "%if{%not{%sync-reached{}}}{"
. "%include{syncinfo}"
. "}"
. "%if{%not{%work-reached{}}}{"
. "%include{replinfo}"
. "}"
. "}"
. "%call{resource-errors}"
. "}",
"default-resource"
=> "%if{%{res}}{"
. "%{res} %human-numbers{}{ }{ }{%resource-size{}} [%count-resource-members{%{res}}/%count-cluster-members{}]"
. "}",
"default-global"
=> "%call{comminfo}",
"1and1"
=> "%if{%{res}}{"
. " %{res} %include{diskstate-1and1} %include{replstate-1and1} %include{flags-1and1} %include{role-1and1} %include{primarynode-1and1}\n"
. "%if{%and{%is-attach{}}{%not{%sync-reached{}}}}{"
. "%include{syncinfo-1and1}"
. "}"
. "%if{%and{%is-attach{}}{%not{%work-threshold-reached{}}}}{"
. "%include{replinfo-1and1}"
. "}"
. "%call{resource-errors-1and1}"
. "}{"
. "%the-pretty-global-msg{}"
. "}",
"diskstate"
=> "%elsif{%not{%get-disk{}}}{"
. "NotJoined"
. "}{%not{%disk-present{}}}{"
. "NotPresent"
. "}{%not{%todo-attach{}}}{"
. "%if{%is-attach{}}{"
. "Detaching"
. "}{"
. "Detached"
. "}"
. "}{%and{%replay-code{}}{%<{%replay-code{}}{0}}}{"
. "DefectiveLog[%errno-text{%replay-code{}}]"
. "}{%is-orphan{}}{"
. "Orphan"
. "}{%not{%is-attach{}}}{"
. "NoAttach"
. "}{%not{%is-consistent{}}}{"
. "%elsif{%not{%todo-primary{}}}{"
. "InConsistent"
. "}{%is-primary{}}{"
. "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]"
. "}{"
. "Recovery"
. "}"
. "}{%not{%work-reached{}}}{"
. "%elsif{%not{%todo-primary{}}}{"
. "OutDated[%call{outdated-flags}]"
. "}{%is-primary{}}{"
. "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]"
. "}{"
. "Recovery"
. "}"
. "}{%and{%ne{%get-primary{}}{}}{%tmp{%let{host}{%get-primary{}}%is-emergency{}}}}{"
. "EmergencyMode"
. "}{"
. "UpToDate"
. "}",
"diskstate-1and1"
=> "%if{%disk-present{}}{"
. "%if{%does{attach}}{"
. "%if{%is-almost-consistent{}}{"
. "%if{%work-reached{}}{"
. "Uptodate"
. "}{"
. "Outdated[%call{outdated-flags-1and1}]"
. "}"
. "}{Inconsistent}"
. "}{Detached}"
. "}{Detached}",
"outdated-flags"
=> "%if{%fetch-reached{}}{}{F}%if{%replay-reached{}}{}{R}",
"outdated-flags-1and1"
=> "%if{%fetch-reached{}}{}{F}%if{%replay-reached{}}{}{R}",
"replstate"
=> "%elsif{%not{%is-module-loaded{}}}{"
. "ModuleNotLoaded"
. "}{%not{%is-alive{%{host}}}}{"
. "UnResponsive"
. "}{%not{%get-disk{}}}{"
. "NotJoined"
. "}{%not{%todo-attach{}}}{"
. "NotStarted"
. "}{%todo-primary{}}{" # I am designated primary
. "%elsif{%is-emergency{}}{"
. "EmergencyMode"
. "}{%is-primary{}}{"
. "Replicating"
. "}{"
. "NotYetPrimary"
. "}"
. "}{" # Secondary
. "%elsif{%and{%sync-rest{}}{%not{%todo{sync}}}}{"
. "PausedSync"
. "}{%is-sync{}}{"
. "Syncing"
. "}{%not{%todo{fetch}}}{"
. "PausedFetch"
. "}{%not{%todo{replay}}}{"
. "PausedReplay"
. "}{%not{%get-primary{}}}{"
. "NoPrimaryDesignated"
. "}{%not{%is-alive{}}}{"
. "PrimaryUnreachable"
. "}{%is-orphan{}}{"
. "Orphan"
. "}{"
. "Replaying"
. "}"
. "}",
"replstate-1and1"
=> "%if{%disk-present{}}{"
. "%if{%is-primary{}}{"
. "Replicating"
. "}{"
. "%if{%is-alive{}}{"
. "%if{%and{%not{%sync-reached{}}}{%not{%todo{sync}}}}{"
. "PausedSync"
. "}{"
. "%if{%does{sync}}{"
. "Syncing"
. "}{"
. "%unless{%and{%todo-fetch{}}{%todo-replay{}}}{"
. "PausedReplay"
. "}{Replaying}"
. "}"
. "}"
. "}{PrimaryUnreachable}"
. "}"
. "}{NotJoined}",
"flags"
=> "%if{%disk-present{}}{%if{%device-present{}}{D}{d}}{-}"
. "%if{%is-consistent{}}{C}{%if{%disk-present{}}{c}{-}}"
. "%if{%does{attach}}{%if{%todo{attach}}{A}{a}}{%if{%todo{attach}}{a}{-}}"
. "%if{%sync-reached{}}{S}{%if{%todo{sync}}{s}{-}}"
. "%if{%fetch-reached{}}{F}{%if{%todo{fetch}}{f}{-}}"
. "%if{%replay-reached{}}{R}{%if{%todo{replay}}{r}{-}}",
"flags-1and1"
=> "-%if{%todo{sync}}{S}{-}%if{%todo{fetch}}{F}{-}%if{%todo{replay}}{R}{-}-",
"todo-role"
=> "%if{%disk-present{}}{"
. "%if{%todo-primary{}}{"
. "Primary"
. "}{"
. "Secondary"
. "}"
. "}{None}",
"role"
=> "%if{%disk-present{}}{"
. "%if{%todo-primary{}}{"
. "%if{%is-primary{}}{"
. "Primary"
. "}{"
. "NotYetPrimary"
. "}"
. "}{"
. "%if{%is-primary{}}{"
. "RemainsPrimary"
. "}{"
. "Secondary"
. "}"
. "}"
. "}{None}",
"role-1and1"
=> "%if{%disk-present{}}{"
. "%if{%is-primary{}}{"
. "Primary"
. "}{"
. "Secondary"
. "}"
. "}{Secondary}",
"primarynode"
=> "%if{%todo-primary{}}{"
. "%{host}"
. "}{"
. "%get-primary{}"
. "}",
"primarynode-1and1"
=> "%if{%disk-present{}}{"
. "%if{%is-primary{}}{"
. "%{host}"
. "}{"
. "%if{%actual-primary{}}{"
. "%actual-primary{}"
. "}{-}"
. "}"
. "}{-}",
"commstate"
=> "%let{comm}{%alive-age{}}"
. "%if{%>={%{comm}}{%{window}}}{"
. "%human-seconds{%{comm}}"
. "}",
"syncinfo"
=> "%let{amount}{%human-numbers{}{ }{ }{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{ }{ }{%sync-rate{}}}"
. "%sprintf{ syncing: %s %.2f%% (%d/%d)%s rate: %.2f %s/sec remaining: %s hrs\n}"
. "{%progress{20}{%sync-pos{}}{0}{%sync-size{}}}"
. "{%sync-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%{rate}{ }{0}}"
. "{%{rate}{ }{1}}"
. "{%human-seconds{%sync-remain{}}}"
. "%call{sync-line}",
"syncinfo-1and1"
=> "%let{amount}{%human-numbers{}{ }{ }{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{ }{ }{%sync-rate{}}}"
. "%sprintf{ syncing: %s %.2f%% (%d/%d)%s rate: %.2f %s/sec remaining: %s hrs\n}"
. "{%progress{20}{%sync-pos{}}{0}{%sync-size{}}}"
. "{%sync-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%{rate}{ }{0}}"
. "{%{rate}{ }{1}}"
. "{%human-seconds{%sync-remain{}}}"
. "%call{sync-line-1and1}",
"replinfo"
=> "%let{amount}{%human-numbers{}{ }{ }{%replay-pos{}}{%fetch-size{}}}"
. "%let{logs}{%replay-lognr{}}"
. "%let{l2}{%fetch-lognr{}}"
. "%let{l3}{%work-lognr{}}"
. "%if{%>{%{l2}}{%{logs}}}{%append{logs}{::%{l2}}}"
. "%if{%>{%{l3}}{%{l2}}}{%append{logs}{..%{l3}}}"
. "%sprintf{ replaying: %s %.2f%% (%d/%d)%s logs: [%s]\n}"
. "{%progress{20}{%replay-pos{}}{%fetch-pos{}}{%fetch-size{}}}"
. "{%work-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%{logs}}"
. "%call{fetch-line}"
. "%call{replay-line}",
"replinfo-1and1"
=> "%let{amount}{%human-numbers{}{ }{ }{%replay-pos{}}{%fetch-size{}}}"
. "%sprintf{ replaying: %s %.2f%% (%d/%d)%s logs: [%d..%d]\n}"
. "{%progress{20}{%replay-pos{}}{%fetch-pos{}}{%fetch-size{}}}"
. "{%work-percent{}}"
. "{%{amount}{ }{0}}"
. "{%{amount}{ }{1}}"
. "{%{amount}{ }{2}}"
. "{%replay-lognr{}}"
. "{%fetch-lognr{}}"
. "%call{fetch-line-1and1}"
. "%call{replay-line-1and1}",
"sync-line"
=> "%let{amount}{%human-numbers{}{}{}{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{}{}{%sync-rate{}}}"
. "%let{remain}{%human-seconds{%sync-remain{}}}"
. " > sync: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"sync-line-1and1"
=> "%let{amount}{%human-numbers{}{}{}{%sync-pos{}}{%sync-size{}}}"
. "%let{rate}{%human-numbers{}{}{}{%sync-rate{}}}"
. "%let{remain}{%human-seconds{%sync-remain{}}}"
. " > sync: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"fetch-line"
=> "%let{amount}{%human-numbers{}{}{}{%fetch-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%fetch-rate{}}}"
. "%let{remain}{%human-seconds{%fetch-remain{}}}"
. "%let{age}{%if{%and{%fetch-remain{}}{%>={%fetch-age{}}{%{window}}}}{ age: %human-seconds{%fetch-age{}}}}"
. "%let{lag}{%if{%and{%fetch-remain{}}{%>={%fetch-lag{}}{%{window}}}}{ lag: %human-seconds{%fetch-lag{}}}}"
. " > fetch: %{amount}%{age}%{lag} rate: %{rate}/s remaining: %{remain}\n",
"fetch-line-1and1"
=> "%let{amount}{%human-numbers{}{}{}{%fetch-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%fetch-rate{}}}"
. "%let{remain}{%human-seconds{%fetch-remain{}}}"
. " > fetch: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"replay-line"
=> "%let{amount}{%human-numbers{}{}{}{%replay-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%replay-rate{}}}"
. "%let{remain}{%human-seconds{%replay-remain{}}}"
. "%let{age}{%if{%and{%replay-remain{}}{%>={%replay-age{}}{%{window}}}}{ age: %human-seconds{%replay-age{}}}}"
. " > replay: %{amount}%{age} rate: %{rate}/s remaining: %{remain}\n",
"replay-line-1and1"
=> "%let{amount}{%human-numbers{}{}{}{%replay-rest{}}}"
. "%let{rate}{%human-numbers{}{}{}{%replay-rate{}}}"
. "%let{remain}{%human-seconds{%replay-remain{}}}"
. " > replay: %{amount} rate: %{rate}/sec remaining: %{remain} hrs\n",
"resource-errors"
=> "%let{fat-count}{%get-resource-fat-count{}}"
. "%if{%{fat-count}}{"
. "FATALS FILE (%{fat-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-fat{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{errs}{%the-err-count{}}"
. "%if{%{errs}}{"
. "ERRORS LNK (%{errs}):\n"
. "%the-pretty-err-msg{}"
. "}"
. "%let{err-count}{%get-resource-err-count{}}"
. "%if{%{err-count}}{"
. "ERRORS FILE (%{err-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-err{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{wrns}{%the-wrn-count{}}"
. "%if{%{wrns}}{"
. "WARNINGS LNK (%{wrns}):\n"
. "%the-pretty-wrn-msg{}"
. "}"
. "%let{wrn-count}{%get-resource-wrn-count{}}"
. "%if{%and{%{verbose}}{%{wrn-count}}}{"
. "WARNINGS FILE (%{wrn-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-wrn{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{infs}{%the-inf-count{}}"
. "%if{%and{%{verbose}}{%{infs}}}{"
. "INFOS LNK (%{infs}):\n"
. "%the-pretty-inf-msg{}"
. "}"
. "%let{status_msg}{%get-log-status-{}}"
. "%if{%and{%{verbose}}{%{status_msg}}}{"
. "STATUS FILE:\n%{status_msg}"
. "}",
"resource-errors-1and1"
=> "%let{fat-count}{%get-resource-fat-count{}}"
. "%if{%{fat-count}}{"
. "FATALS FILE (%{fat-count}):\n"
. "%get-resource-fat{}"
. "}"
. "%let{errs}{%the-err-count{}}"
. "%if{%{errs}}{"
. "ERRORS LNK (%{errs}):\n"
. "%the-pretty-err-msg{}"
. "}"
. "%let{err-count}{%get-resource-err-count{}}"
. "%if{%{err-count}}{"
. "ERRORS FILE (%{err-count}):\n"
. "%get-resource-err{}"
. "}"
. "%let{wrns}{%the-wrn-count{}}"
. "%if{%{wrns}}{"
. "WARNINGS LNK (%{wrns}):\n"
. "%the-pretty-wrn-msg{}"
. "}"
. "%let{wrn-count}{%get-resource-wrn-count{}}"
. "%if{%and{%{verbose}}{%{wrn-count}}}{"
. "WARNINGS FILE (%{wrn-count})"
. "%if{%{verbose}}{"
. ":\n%get-resource-wrn{}"
. "}{"
. ": available with --verbose\n"
. "}"
. "}"
. "%let{infs}{%the-inf-count{}}"
. "%if{%and{%{verbose}}{%{infs}}}{"
. "INFOS LNK (%{infs}):\n"
. "%the-pretty-inf-msg{}"
. "}"
. "%let{status_msg}{%get-log-status-{}}"
. "%if{%and{%{verbose}}{%{status_msg}}}{"
. "STATUS FILE:\n%{status_msg}"
. "}",
"comminfo"
=> "%let{txt}{%the-pretty-global-msg{}}"
. "%let{txt}{%subst{%{txt}}{^.*CONNECTED.*\$\n*}{}{mg}}"
. "%if{%{txt}}{"
. "++++++++++++ Global / Communication Status ++++++++++++\n"
. "%{txt}"
. "}"
. "%let{timeslip}{%-{%time{}}{%real-time{}}}"
. "%if{%>{%{timeslip}}{%{window}}}{"
. "++++++++++++ Lamport time slip: %human-seconds{%{timeslip}} ++++++++++++\n"
. "}",
# drbd similar ones
"state"
=> "NYI Please override macro \\%%{0}\\{\\}",
"cstate"
=> "NYI Please override macro \\%%{0}\\{\\}",
"dstate"
=> "NYI Please override macro \\%%{0}\\{\\}",
"status"
=> "NYI Please override macro \\%%{0}\\{\\}",
);
my %view_macros = %complex_macros;
# add some trivial macros to the command line interface
# FIXME: only at most one argument is allowed for now.
my %trivial_globs =
(
# intended for human use
"{all,the}-{pretty-,}{global-,}{{err,wrn,inf}-,}msg"
=> "",
"{is,todo}-{attach,sync,fetch,replay,primary}"
=> "",
"is-{split-brain,consistent,emergency,orphan}"
=> "",
"rest-space"
=> "",
"get-{disk,device}"
=> "",
"present-{disk,device}"
=> "(deprecated, use *-present instead)",
"{disk,device}-present"
=> "",
"device-opened"
=> "",
"get-log-status"
=> "",
"get-resource-{fat,err,wrn}{,-count}"
=> "",
# intended for scripting
"{my,all}-resources"
=> "",
"{cluster,resource}-members"
=> "",
"count-{cluster,resource}-members"
=> "",
"{disk,resource,device}-size"
=> "",
"resource-possible-size"
=> "",
"deletable-size"
=> "",
"occupied-size"
=> "",
"replay-code"
=> "When negative, this indidates that a replay/recovery error has occurred.",
"errno-text"
=> "Convert errno numbers (positive or negative) into human readable text.",
"{sync,fetch,replay,work,syncpos}-{size,pos}"
=> "",
"{sync,fetch,replay,work}-{rest,{almost-,threshold-,}reached,percent,permille,vector}"
=> "",
"writeback-rest"
=> "",
"{sync,fetch,replay}-{rate,remain}"
=> "",
"replay-basenr"
=> "",
"{fetch,replay,work}-{lognr,logcount}"
=> "",
"summary-vector",
=> "",
"{time,real-time}"
=> "",
"{alive,fetch,replay,work}-{timestamp,age,lag}"
=> "",
"{tree,features}-version"
=> "",
"{get,actual}-primary"
=> "",
"is-{alive}"
=> "",
"systemd-unit"
=> "",
"uuid"
=> "",
"tree"
=> "",
"wait-{is,todo}-{attach,sync,fetch,replay,primary}-{on,off}"
=> "",
);
my $glob = "";
foreach my $new_glob (sort alphanum_cmp keys(%trivial_globs)) {
$glob .= "," if $glob;
$glob .= $new_glob;
}
foreach my $name (lamport_glob("{$glob}")) {
$view_macros{$name} = "\%primitive-$name\{\%\{1}}";
}
sub _get_pre {
my ($rest, $add) = @_;
$rest =~ s{(\A.*\n)}{}sp;
$add = 0 if $1;
return length($rest) + $add;
}
sub _break_line {
my ($result, $add, $indent) = @_;
my $pre_len = _get_pre($result, $add);
if ($pre_len != $indent) {
$result .= "\\\n" . ' ' x $indent;
}
return $result;
}
sub _pretty_macro {
my ($text, $add, $indent) = @_;
$text =~ s/\\n/\n/gs;
my $result = "";
# look for function calls
while ($text =~ m/^($match_fn_head(?:{})*)/mp) {
$result .= $PREMATCH;
my $fn = $1;
$text = $POSTMATCH;
$result = _break_line($result, $add, $indent);
$add = 0;
$result .= $fn;
while ($text =~ m/\A\{/sp) {
# don't break simple / non-recursive / unbreakable arguments
if ($text =~ m/\A(\{(?:\s$match_inner|[^%{}]*)\})/sp) {
my $shortcut = $1;
$text = $POSTMATCH;
# make newlines non-verbatim
$shortcut =~ s{\n}{\\n}spg;
$result .= $shortcut;
next;
}
# break more complex arguments
#$result .= "{\\\n" . ' ' x $indent;
$result .= "{";
if ($text =~ m/\A\{([\s]$match_inner|)\}/sp) {
$result .= "$1}";
$text = $POSTMATCH;
} elsif ($text =~ m/\A\{([^\s]$match_inner)\}/sp ) {
my $arg = $1;
$text = $POSTMATCH;
my $sub_add = _get_pre($result, 0);
my $subst = _pretty_macro($arg, $sub_add, $indent + 2);
$result .= $subst;
$result = _break_line($result, 0, $indent);
$result .= "}";
} else {
ldie "wtf '$text'?\n";
}
}
}
$result .= $text;
return $result;
}
sub pretty_macro {
my $txt = _pretty_macro(@_);
# always add a trailing newline (for vi ;)
$txt .= "\\\n" unless $txt =~ m{\n\Z}sp;
return $txt;
}
get_global_versions();
if (defined($ARGV[0]) && $ARGV[0] =~ m/^dump-(all-)?macros$/) {
my %macros = %complex_macros;
%macros = %view_macros if defined($1);
foreach my $name (keys %macros) {
my $txt = $macros{$name};
$txt = pretty_macro($txt, 0, 0);
my $file = "$name.tpl";
if (-r $file) { # backup already existing files
for (my $i = 0; ; $i++) {
my $file_old = "$file.old$i";
unless (-r $file_old) {
rename($file, $file_old);
last;
}
}
}
open(OUT, ">", $file) or ldie "cannot create file '$file'\n";
print OUT $txt;
close(OUT);
}
exit(0);
}
sub get_macro {
my ($cmd, $tolerate) = @_;
if ($macro) {
my $orig = $macro;
$macro = ""; # consume once
return $orig;
}
foreach my $path (@MARS_PATH) {
my $file = "$path/$cmd.tpl";
if (-r $file) {
lprint_stderr "using macro file '$file'\n" if (defined($view_macros{$cmd}));
local $/; # slurp
open(IN, "<", $file) or next;
my $tpl = <IN>;
close(IN);
$tpl =~ s{$match_comment}{}sg;
$tpl =~ s{\\n}{\n}sg;
return $tpl;
}
}
return $view_macros{$cmd} if (defined($view_macros{$cmd}));
ldie "cannot find macro '$cmd'\n" unless defined($tolerate);
return "";
}
sub view_cmd {
my ($cmd, $res) = (shift, shift);
if ($cmd =~ s/^prettyprint-//) {
my $txt = get_macro($cmd);
print pretty_macro($txt, 0);
return;
}
$cmd =~ s/^view-?//;
$cmd = "default" unless $cmd;
my $tpl = get_macro($cmd);
my $result = eval_macro($cmd, $res, $tpl, @_);
if ($result ne "") { # add trailing newline if none exists
chomp $result;
$result .= "\n";
}
print $result;
}
##################################################################
# command table of all commands
my %cmd_table =
(
# new keywords
"create-uuid"
=> [
"verbose 3",
"usage: create-uuid (no parameters)",
"Deprecated.",
"This is only needed if you have a very old $mars/",
"directory structure from MARS version light0.1beta05",
"or earlier.",
\&create_uuid,
],
"create-cluster"
=> [
"usage: create-cluster (no parameters)",
"This must be called exactly once when creating a new cluster.",
"Don't call this again! Use join-cluster on the secondary nodes.",
"Please read the PDF manual for details.",
\&create_cluster,
],
"join-cluster"
=> [
"usage: join-cluster <hostname_of_primary>",
"Establishes a new cluster membership.",
"This must be called once on any new cluster member.",
"This is a prerequisite for join-resource.",
\&join_cluster,
],
"merge-cluster"
=> [
"usage: merge-cluster <hostname_of_other_cluster>",
"Precondition: the resource names of both clusters must be disjoint.",
"Create the union of two clusters, consisting of the",
"union of all machines, and the union of all resources.",
"The members of each resource are _not_ changed by this.",
"This is useful for creating a big \"virtual LVM cluster\" where",
"resources can be almost arbitrarily migrated between machines via",
"later join-resource / leave-resource operations.",
\&merge_cluster,
],
"merge-cluster-list"
=> [
"usage: merge-cluster-list",
"Determine the local list of resources.",
"Useful for checking or analysis of merge-cluster disjointness by hand.",
\&merge_cluster,
],
"merge-cluster-check"
=> [
"usage: merge-cluster-check <hostname_of_other_cluster>",
"Check whether the resources of both clusters are disjoint.",
"Useful for checking in advance whether merge-cluster would be",
"possible.",
\&merge_cluster,
],
"split-cluster"
=> [
"usage: split-cluster (no parameters)",
"NOT OFFICIALLY SUPPORTED - ONLY FOR EXPERTS.",
"RTFS = Read The Fucking Sourcecode.",
"Use this only if you know what you are doing.",
\&split_cluster,
],
"leave-cluster"
=> [
"usage: leave-cluster (no parameters)",
"This can be used for final deconstruction of a cluster member.",
"Prior to this, all resources must have been left",
"via leave-resource.",
"Notice: this will never destroy the cluster UID on the /mars/",
"filesystem.",
"Please read the PDF manual for details.",
\&leave_cluster,
],
"create-resource"
=> [
"usage: create-resource <resource_name> </dev/lv/mydata>",
"(further syntax variants are described in the PDF manual).",
"Create a new resource out of a pre-existing disk (backing",
"block device) /dev/lv/mydata (or similar).",
"The current node will start in primary role, thus",
"/dev/mars/<resource_name> will appear after a short time, initially",
"showing the same contents as the underlying disk /dev/lv/mydata.",
"It is good practice to name the resource <resource_name> and the",
"disk name identical.",
\&create_res,
],
"join-resource"
=> [
"usage: join-resource <resource_name> </dev/lv/mydata>",
"(further syntax variants are described in the PDF manual).",
"The resource <resource_name> must have been already created on",
"another cluster node, and the network must be healthy.",
"The contents of the local replica disk /dev/lv/mydata will be",
"overwritten by the initial fast full sync from the currently",
"designated primary node.",
"After the initial full sync has finished, the current host will",
"act in secondary role.",
"For details on size constraints etc, refer to the PDF manual.",
\&create_res,
],
"leave-resource"
=> [
"Precondition: the local host must be in secondary role.",
"Stop being a member of the resource, and thus stop all",
"replication activities. The status of the underlying disk",
"will remain in its current state (whatever it is).",
\&leave_res_phase0,
"check preconditions",
\&leave_res_phase1,
"switch state",
\&leave_res_phase2,
"purge logfiles",
\&leave_res_phase3,
"wait for deletions",
],
"delete-resource"
=> [
"CAUTION! This is dangerous when the network is somehow",
"interrupted, or when damaged nodes are later re-surrected",
"in any way.\n",
"Precondition: the resource must no longer have any members",
"(see leave-resource).",
"This is only needed when you _insist_ on re-using a damaged",
"resource for re-creating a new one with exactly the same",
"old <resource_name>.",
"HINT: best practice is to not use this, but just create a _new_",
"resource with a new <resource_name> out of your local disks.",
"Please read the PDF manual on potential consequences.",
\&delete_res,
],
"set-systemd-unit"
=> [
"usage: set-systemd-unit <resource_name> <start_unit_name> [<stop_unit_name>]",
"This activates the systemd template engine of marsadm.",
"Please read mars-user-manual.pdf on this.",
"When <stop_unit_name> is omitted, it will be treated equal to",
"<start_unit_name>.",
\&set_systemd_unit,
],
"get-systemd-unit"
=> [
"usage: get-systemd-unit <resource_name>",
"Show the system units (for start and stop), or empty when unset.",
\&get_systemd_unit,
],
"set-systemd-want"
=> [
"usage: set-systemd-want <resource_name> <host_name>",
"Override the current location where the complete systemd unit stack",
"should be started.",
"Useful for a _temporary_ stop of the systemd unit stack by supplying",
"the special hostname \"(none)\".",
"For a _permanent_ stop, use \"marsadm set-systemd-unit <resource>\"",
"instead.",
\&set_systemd_want,
],
"get-systemd-want"
=> [
"usage: get-systemd-want <resource_name>",
"Show the current hostname where the complete systemd unit stack",
"between start- and stop-unit should appear.",
"Reports empty when unset, or \"(none)\" when stopped.",
\&get_systemd_want,
],
"set-connect-pref-list"
=> [
"verbose 2",
"usage: set-connect-pref-list <resource_name> <host_list>",
"Provisionary command for internal use at 1&1. Will be replaced by",
"a better concept somewhen. The <host_list> must be comma-separated.",
\&set_connect_pref_list,
],
"get-connect-pref-list"
=> [
"verbose 2",
"Provisionary command for internal use at 1&1. Will be replaced by",
"a better concept somewhen.",
"Shows the outcome of set-connect-pref-list.",
\&set_connect_pref_list,
],
"log-rotate"
=> [
"Only useful at the primary side.",
"Start writing transaction logs into a new transaction logfile.",
"This should be regularly called by a cron job or similar.",
"For regular maintainance cron jobs, please prefer 'marsadm cron'.",
"For details and best practices, please refer to the PDF manual.",
\&logrotate_res,
],
"log-delete-one"
=> [
"When possible, globally delete at most one old transaction logfile",
"which is known to be superfluous, i.e. all secondaries no longer",
"need to replay it.",
"Hint: use this only for testing and manual inspection.",
"For regular maintainance cron jobs, please prefer cron",
"or log-delete-all.",
\&logdelete_res,
],
"log-delete"
=> [
"When possible, globally delete all old transaction logfiles which",
"are known to be superflous, i.e. all secondaries no longer need",
"to replay them.",
"This must be regularly called by a cron job or similar, in order",
"to prevent overflow of the /mars/ directory.",
"For regular maintainance cron jobs, please prefer 'marsadm cron'.",
"For details and best practices, please refer to the PDF manual.",
\&logdelete_res,
],
"log-delete-all"
=> [
"Alias for log-delete",
\&logdelete_res,
],
"cron"
=> [
"usage: cron (no parameters)",
"Do all necessary regular housekeeping tasks.",
"This is equivalent to log-rotate all; sleep 5; log-delete-all all.",
\&cron_res,
],
"log-purge-all"
=> [
"This is potentially dangerous.",
"Use this only if you are really desperate in trying to resolve a",
"split brain. Use this only after reading the PDF manual!",
\&log_purge_res,
],
"err-purge-all"
=> [
"Remove any err message from the given resources.",
\&err_purge_res,
],
"link-purge-all"
=> [
"Remove any .deleted links.",
\&link_purge_res,
],
"fake-sync"
=> [
"verbose 1",
"Attention: this is potentially dangerous.",
"Only for experts.",
"Please read the PDF manual to understand the risks!",
\&fake_sync_phase1,
"switch sync off",
\&fake_sync_phase2,
"wait for sync off",
\&fake_sync_phase3,
"fake sync",
],
"set-link"
=> [
"verbose 1",
"usage: set-link <path> <value>",
"Only for experts.",
"Will disappear in a future MARS release.",
\&set_link_cmd,
],
"get-link"
=> [
"verbose 1",
"usage: get-link <path>",
"Only for experts.",
"Will disappear in a future MARS release.",
\&set_link_cmd,
],
"set-sync-limit-value"
=> [
"usage: set-sync-limit-value <new_value>",
"Set the maximum number of resources which should by syncing",
"concurrently.",
\&set_sync_limit_value,
],
"get-sync-limit-value"
=> [
"usage: get-sync-limit-value (no parameters)",
"For retrieval of the value set by set-sync-limit-value.",
\&set_sync_limit_value,
],
"delete-file"
=> [
"verbose 1",
"usage: delete-file <path>",
"VERY dangerous!",
"Only for experts.",
\&delete_file_cmd,
],
"set-emergency-limit"
=> [
"usage: set-emergency-limit <resource_name> <value>",
"Set a per-resource emergency limit for disk space in /mars.",
"See PDF manual for details.",
\&emergency_limit_res,
],
"get-emergency-limit"
=> [
"Counterpart of set-emergency-limit (per-resource emergency limit)",
\&emergency_limit_res,
],
"emergency-limit" => \&emergency_limit_res,
"cat"
=> [
"usage: cat <path>",
"Print internal debug output in human readable form.",
"Numerical timestamps and numerical error codes are replaced",
"by more readable means.",
"Example: marsadm cat /mars/5.total.status",
\&cat_cmd,
],
"show"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&show_cmd,
],
"show-errors"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&show_errors_cmd,
],
"show-state"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&mars_state_cmd,
],
"mars-state" => \&mars_state_cmd, # deprecated
"show-info"
=> [
"verbose 3",
"Deprecated old low-level tool. Don't use. Use macros instead.",
\&mars_info_cmd,
],
"mars-info" => \&mars_info_cmd, # deprecated
"pause-replay-local"
=> [
"Stop replaying transaction logfiles for now.",
"This is independent from any {pause,resume}-fetch operations.",
"This may be used for freezing the state of your replica for some",
"time, if you have enough space on /mars/.",
"Only useful on a secondary node.",
\&pause_replay_local_res,
],
"pause-replay-global"
=> [
"Like pause-replay-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_replay_global_res,
],
"pause-replay"
=> [
"See pause-replay-local.",
\&pause_replay_local_res,
],
"resume-replay-local"
=> [
"Restart replaying transaction logfiles, when there is some",
"data left.",
"This is independent from any {pause,resume}-fetch operations.",
"This should be used for unfreezing the state of your local replica.",
"Only useful on a secondary node.",
\&pause_replay_local_res,
],
"resume-replay-global"
=> [
"Like resume-replay-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_replay_global_res,
],
"resume-replay"
=> [
"See resume-replay-local.",
\&pause_replay_local_res,
],
"set-replay"
=> [
"verbose 1",
"VERY dangerous!",
"Only for experts.",
\&set_replay_res,
],
"wait-umount"
=> [
"Wait until /dev/mars/<resource_name> has disappeared in the",
"cluster (even remotely).",
"Useful on both primary and secondary nodes.",
\&wait_umount_res,
],
"wait-cluster"
=> [
"usage: wait-resource [<resource_name>]",
"Waits until a ping-pong communication has succeeded in the",
"whole cluster (or only the members of <resource_name>).",
"NOTICE: this is extremely useful for avoiding races when scripting",
"in a cluster.",
\&wait_cluster,
],
"wait-resource"
=> [
"usage: wait-resource <resource_name>",
" [[attach|fetch|replay|sync][-on|-off]]",
"Wait until the given condition is met on the resource, locally.",
\&wait_cond,
],
"update-cluster"
=> [
"usage: update-cluster [<resource_name>]",
"Fetch all the links from all joined cluster hosts.",
"Use this between create-resource and join-resource.",
"NOTICE: this is extremely useful for avoiding races when scripting",
"in a cluster.",
\&update_cluster,
],
# compatible keywords (or their derivatives)
"attach"
=> [
"Attaches the local disk (backing block device) to the resource.",
"The disk must have been previously configured at",
"{create,join}-resource.",
"When designated as a primary, /dev/mars/\$res will also appear.",
"This does not change the state of {fetch,replay}.",
"For a complete local startup of the resource, use 'marsadm up'.",
\&attach_res_phase0,
"check preconditions",
\&attach_res_phase1,
"switch state",
\&attach_res_phase2,
"wait for effect",
],
"detach"
=> [
"Detaches the local disk (backing block device) from the",
"MARS resource.",
"Caution! you may read data from the local disk afterwards,",
"but ensure that no data is written to it!",
"Otherwise, you are likely to produce harmful inconsistencies.",
"When running in primary role, /dev/mars/\$res will also disappear.",
"This does not change the state of {fetch,replay}.",
"For a complete local shutdown of the resource, use 'marsadm down'.",
\&attach_res_phase0,
"check preconditions",
"FORK",
\&attach_res_phase0b,
"wait for systemd device release",
\&attach_res_phase1,
"switch state",
\&attach_res_phase2,
"wait for effect",
],
"resume-fetch-local"
=> [
"Start fetching transaction logfiles from the current",
"designated primary node, if there is one.",
"This is independent from any {pause,resume}-replay operations.",
"Only useful on a secondary node.",
\&fetch_local_res,
],
"resume-fetch-global"
=> [
"Like resume-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"resume-fetch"
=> [
"See resume-fetch-local.",
\&fetch_local_res,
],
"pause-fetch-local"
=> [
"Stop fetching transaction logfiles from the current",
"designated primary.",
"This is independent from any {pause,resume}-replay operations.",
"Only useful on a secondary node.",
\&fetch_local_res,
],
"pause-fetch-global"
=> [
"Like pause-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"pause-fetch"
=> [
"See pause-fetch-local.",
\&fetch_local_res,
],
"connect-local"
=> [
"See resume-fetch-local.",
\&fetch_local_res,
],
"connect-global"
=> [
"Like resume-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"connect"
=> [
"See resume-fetch-local.",
\&fetch_local_res,
],
"disconnect-local"
=> [
"See pause-fetch-local.",
\&fetch_local_res,
],
"disconnect-global"
=> [
"Like pause-fetch-local, but affects all resource members",
"in the cluster (remotely).",
\&fetch_global_res,
],
"disconnect"
=> [
"See pause-fetch-local.",
\&fetch_local_res,
],
"syncer" => \&ignore_cmd,
"up"
=> [
"Shortcut for attach + resume-sync + resume-fetch + resume-replay.",
\&up_res_phase0,
"check preconditions",
\&up_res_phase1,
"switch state",
\&up_res_phase2,
"wait for effect",
],
"down"
=> [
"Shortcut for detach + pause-sync + pause-fetch + pause-replay.",
\&up_res_phase0,
"check preconditions",
"FORK",
\&attach_res_phase0b,
"wait for systemd device release",
\&up_res_phase1,
"switch state",
\&up_res_phase2,
"wait for effect",
],
"primary"
=> [
"Promote the resource into primary role.",
"This is necessary for /dev/mars/\$res to appear on the local host.",
"Notice: by concept there can be only _one_ designated primary",
"in a cluster at the same time.",
"The role change is automatically distributed to the other nodes",
"in the cluster, provided that the network is healthy.",
"The old primary node will _automatically_ go",
"into secondary role first. This is different from DRBD!",
"With MARS, you don't need an intermediate 'secondary' command",
"for switching roles.",
"It is usually better to _directly_ switch the primary roles",
"between both hosts.",
"When --force is not given, a planned handover is started:",
"the local host will only become actually primary _after_ the",
"old primary is gone, and all old transaction logs have been",
"fetched and replayed at the new designated priamry.",
"When --force is given, no handover is attempted. A a consequence,",
"a split brain situation is likely to emerge.",
"Thus, use --force only after an ordinary handover attempt has",
"failed, and when you don't care about the split brain.",
"For more details, please refer to the PDF manual.",
[
\&wait_cluster_noforce,
],
\&primary_phase0,
"check preconditions",
"FORK",
\&primary_phase0a,
"conditionally wait for fetch off",
\&primary_phase0b,
"wait for systemd",
"LOOP",
\&primary_phase1,
"leave primary state",
"LOOP",
\&primary_phase1b,
"trigger remote",
"LOOP",
\&primary_phase2,
"wait for cluster when necessary",
"LOOP",
\&primary_phase2b,
"avoid split brain",
\&primary_phase3,
"switch to primary",
\&primary_phase3b,
"trigger remote",
\&primary_phase4,
"wait for device",
],
"secondary"
=> [
"Promote all cluster members into secondary role, globally.",
"In contrast to DRBD, this is not needed as an intermediate step",
"for planned handover between an old and a new primary node.",
"The only reasonable usage is before the last leave-resource of the",
"last cluster member, immediately before leave-cluster is executed",
"for final deconstruction of the cluster.",
"In all other cases, please prefer 'primary' for direct handover",
"between cluster nodes.",
"Notice: 'secondary' sets the global designated primary node",
"to '(none)' which in turn prevents the execution of 'invalidate'",
"or 'join-resource' or 'resize' anywhere in the cluster.",
"Therefore, don't unnecessarily give 'secondary'!",
\&primary_phase0,
"check preconditions",
"FORK",
\&primary_phase1,
"leave primary state",
\&primary_phase1b,
"trigger remote",
\&primary_phase4,
"wait for effect",
],
"invalidate"
=> [
"Only useful on a secondary node.",
"Forces MARS to consider the local replica disk as being",
"inconsistent, and therefore starting a fast full-sync from",
"the currently designated primary node (which must exist;",
"therefore avoid the 'secondary' command).",
"This is usually needed for resolving emergency mode.",
"When having k=2 replicas, this can be also used for",
"quick-and-simple split-brain resolution.",
"In other cases, or when the split-brain is not resolved by",
"this command, please use the 'leave-resource' / 'join-resource'",
"method as described in the PDF manual (in the right order as",
"described there).",
\&invalidate_res_phase0,
"check preconditions",
\&invalidate_res_phase1,
"switch off everything",
\&invalidate_res_phase2,
"wait for everything switched off",
\&invalidate_res_phase3,
"start deletion of old version links",
\&invalidate_res_phase4,
"wait for effect",
\&invalidate_res_phase5,
"start purging old logfiles",
\&invalidate_res_phase6,
"wait for effect",
\&invalidate_res_phase7,
"set new replaylink",
\&invalidate_res_phase8,
"wait for effect",
\&invalidate_res_phase9,
"switch on everything again",
],
"invalidate-remote" => \&forbidden_cmd,
"resize"
=> [
"Prerequisite: all underlying disks (usually /dev/vg/\$res) must",
"have been already increased, e.g. at the LVM layer (cf. lvresize).",
"Causes MARS to re-examine all sizing constraints on all members of",
"the resource, and increase the global logical size of the resource",
"accordingly.",
"Shrinking is currently not yet implemented.",
"When successful, /dev/mars/\$res at the primary will be increased",
"in size. In addition, all secondaries will start an incremental",
"fast full-sync to get the enlarged parts from the primary.",
\&resize_phase0,
"check preconditions",
\&resize_phase1,
"set new size",
\&resize_phase2,
"wait for change",
],
"check-resize" => \&ignore_cmd,
"create-md" => \&senseless_cmd,
"get-gi" => \&ignore_cmd,
"show-gi" => \&ignore_cmd,
"dump-md" => \&senseless_cmd,
"outdate" => \&ignore_cmd,
"adjust" => \&ignore_cmd,
"wait-connect"
=> [
"usage: wait-connect [<resource_name>]",
"See wait-cluster.",
\&wait_cluster,
],
"role"
=> [
"verbose 3",
"Deprecated.",
"Please use the macro command 'view-role' instead.",
"For even better summary information, use plain 'view'.",
\&role_cmd,
],
"state"
=> [
"verbose 3",
"Deprecated.",
"Please use the macro command 'view-role' instead.",
"For even better summary information, use plain 'view'.",
\&role_cmd,
],
"cstate" => \&nyi_cmd,
"dstate" => \&nyi_cmd,
"status" => \&nyi_cmd,
"dump" => \&senseless_cmd,
"verify" => \&forbidden_cmd,
"pause-sync-local"
=> [
"Pause the initial data sync at current stage.",
"This has only an effect if a sync is actually running (i.e.",
"there is something to be actually synced).",
"Don't pause too long, because the local replica will remain",
"inconsistent during the pause.",
"Use this only for limited reduction of system load.",
"Only useful on a secondary node.",
\&pause_sync_local_res,
],
"pause-sync-global"
=> [
"Like pause-sync-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_sync_global_res,
],
"pause-sync"
=> [
"See pause-sync-local.",
\&pause_sync_local_res,
],
"resume-sync-local"
=> [
"Resume any initial / incremental data sync at the stage where it",
"had been interrupted by pause-sync.",
"Only useful on a secondary node.",
\&pause_sync_local_res,
],
"resume-sync-global"
=> [
"Like resume-sync-local, but affects all resource members",
"in the cluster (remotely).",
\&pause_sync_global_res,
],
"resume-sync"
=> [
"See resume-sync-local.",
\&pause_sync_local_res,
],
"new-current-uuid" => \&senseless_cmd,
"hidden-commands" => \&ignore_cmd,
# lowlevel tools
"lowlevel-ls-host-ips"
=> [
"usage: lowlevel-ls-host-ips",
"List cluster member names and IP addresses.",
\&lowlevel_ls_host_ips,
],
"lowlevel-set-host-ip"
=> [
"usage: lowlevel-ls-host-ips <hostname> <new_ip>",
"Set IP for host.",
\&lowlevel_set_host_ip,
],
"lowlevel-delete-host"
=> [
"usage: lowlevel-ls-host-ips <hostname>",
"Delete cluster member.",
\&lowlevel_delete_host,
],
# systemd interface
"systemd-trigger"
=> [
\&systemd_trigger,
],
);
sub helplist {
my $msg = shift;
print "ERROR: $msg" if ($msg);
print "
Thorough documentation is in mars-user-manual.pdf. Please use the PDF manual
as authoritative reference! Here is only a short summary of the most
important sub-commands / options:
marsadm [<global_options>] <command> [<resource_names> | all | <args> ]
marsadm [<global_options>] view[-<macroname>] [<resource_names> | all ]
<global_option> =
--force
Skip safety checks.
Use this only when you really know what you are doing!
Warning! This is dangerous! First try --dry-run.
Not combinable with 'all'.
--ignore-sync
Allow primary handover even when some sync is running somewhere.
This is less rude than --force because it checks for all else
preconditions.
--dry-run
Don't modify the symlink tree, but tell what would be done.
Use this before starting potentially harmful actions such as
'delete-resource'.
--verbose
Increase speakyness of some commands.
--parallel
Only resonable when combined with \"all\".
For each resource, fork() a sub-process running independently
from other resources. May seepd up handover a lot.
However, several cluster managers are known to have problems
with a high parallelism degree (up to deadlocks).
Only use this after thorough testing in combination with your
whole operation stack!
--parallel=<number>
Like --parallel, but limit the parallelism degree to the given
number of parallel processes.
--delete-method=<code>
EXPERIMENTAL! Only for testing! This option will disappear again!
<code> == 0: Use new deletion method
<code> == 1: Use old deletion method
default is 1 for compatibility.
--logger=/path/to/usr/bin/logger
Use an alternative syslog messenger.
When empty, disable syslogging.
--max-deletions=<number>
When your network or your firewall rules are defective over a
longer time, too many deletion links may accumulate at
/mars/todo-global/delete-* and sibling locations.
This limit is preventing overflow of the filesystem as well
as overloading the worker threads.
--thresh-logfiles=<number>
--thresh-logsize=<number>
Prevention of too many small logfiles when secondaries are not
catching up. When more than thresh-logfiles are already present,
the next one is only created when the last one has at least
size thresh-logsize (in units of GB).
--timeout=<seconds>
Current default: $timeout
Abort safety checks and waiting loops after timeout with an error.
When giving 'all' as resource agument, this works for each
resource independently.
The special value -1 means \"infinite\".
--window=<seconds>
Current default: $window
Treat other cluster nodes as healthy when some communcation has
occured during the given time window.
--threshold=<bytes>
Some macros like 'fetch-threshold-reached' use this for determining
their sloppyness.
--host=<hostname>
Act as if the command was running on cluster node <hostname>.
Warning! This is dangerous! First try --dry-run
--backup-dir=</absolute_path>
Only for experts.
Used by several special commands like merge-cluster, split-cluster
etc for creating backups of important data.
--ip=<ip>
Override the IP address stored in the symlink tree, as well as
the default IP determined from the list of network interfaces.
Usually you will need this only at 'create-cluster' or
'join-cluster' for resolving ambiguities.
--ssh-port=<port_nr>
Override the default ssh port (22) for ssh and rsync.
Useful for running {join,merge}-cluster on non-standard ssh ports.
--ssh-opts=\"<ssh_commandline_options>\"
Override the default ssh commandline options. Also used for rsync.
--macro=<text>
Handy for testing short macro evaluations at the command line.
<command> =
";
my $item_count = 0;
foreach my $cmdname (sort alphanum_cmp keys(%cmd_table)) {
my $list = $cmd_table{$cmdname};
next unless ref($list) eq "ARRAY";
my @copy = @$list;
my $line_count = 0;
my $verb_level = 0;
while (my $txt = shift @copy) {
last if ref($txt) eq "CODE";
next if ref($txt) eq "ARRAY";
my $skip_this = 0;
if ($txt =~ m/^verbose *([0-9]+)/) {
$verb_level = $1;
$skip_this = 1;
}
if ($verbose < 0) {
last if $verb_level != -$verbose;
} else {
last if $verb_level > $verbose;
}
next if $skip_this;
if (!$line_count) {
print "\n" if $item_count++;
print " $cmdname\n";
print " usage: $cmdname <resource_name>\n" if $txt !~ m/usage:/;
}
print " $txt\n";
$line_count++;
}
}
print "
<resource_names> = comma-separated list of resource names or \"all\" for all resources
";
print "
<macroname> = <complex_macroname> | <primitive_macroname>
<complex_macroname> =
";
foreach my $macroname (sort alphanum_cmp keys(%complex_macros)) {
print " $macroname\n";
}
print "
<primitive_macroname> =
";
foreach my $glob (sort alphanum_cmp keys(%trivial_globs)) {
my $txt = $trivial_globs{$glob};
print " $glob\n";
print " $txt\n" if $txt;
}
exit 0;
}
my @args;
foreach my $arg (@ARGV) {
if ($arg eq "--force" || $arg eq "-f") {
$force++;
next;
} elsif ($arg eq "--ignore-sync") {
$ignore_sync++;
next;
} elsif ($arg eq "--dry-run" || $arg eq "-d") {
$dry_run++;
next;
} elsif ($arg =~ m/--max-deletions\s*=\s*(-?[0-9]+)/) {
$max_deletions = $1;
next;
} elsif ($arg =~ m/--thresh-logfiles\s*=\s*([0-9]+)/) {
$thresh_logfiles = $1;
next;
} elsif ($arg =~ m/--thresh-logsize\s*=\s*([0-9]+)/) {
$thresh_logsize = $1;
next;
} elsif ($arg =~ m/--parallel(\s*=\s*(-?[0-9]+)?)?/) {
$parallel = defined($2) ? $2 : 0;
next;
} elsif ($arg =~ m/--delete-method\s*=\s*([0-9]+)/) {
$compat_deletions = $1;
my $compat_path = "$mars/compat-deletions";
my $old = get_link($compat_path, 2);
if (!defined($old) || $old ne $compat_deletions) {
if (-d "/mars" && !$dry_run) {
lprint "Setting '$compat_path' to '$compat_deletions'\n";
set_link($compat_deletions, $compat_path);
finish_links();
} else {
lwarn "Cannot set '$compat_path' to '$compat_deletions'\n";
}
}
next;
} elsif ($arg =~ s/--verbose\s*=\s*(-?[0-9]+)/$1/) {
$verbose = $arg;
next;
} elsif ($arg eq "--verbose" || $arg eq "-v") {
$verbose++;
next;
} elsif ($arg =~ s/--logger\s*=\s*(.*)/$1/) {
$logger = $arg;
next;
} elsif ($arg =~ s/--timeout\s*=\s*([0-9]+)/$1/) {
$timeout = $arg;
next;
} elsif ($arg =~ s/--window\s*=\s*([0-9]+)/$1/) {
$window = $arg;
next;
} elsif ($arg =~ s/--threshold\s*=\s*([0-9]+)/$1/) {
$threshold = get_size($arg);
next;
} elsif ($arg =~ s/--host\s*=\s*([-_A-Za-z0-9]+)/$1/) {
check_id($arg, 1);
if ($arg ne $host) {
lprint "ATTENTION: acting as if I were host '$arg'\n";
lwarn "some commands require local knowledge not available here.\n";
lwarn "thus something may fail or go wrong - use this at your risk!\n";
$host = $arg;
}
next;
} elsif ($arg =~ m/--backup-dir\s*=\s*(\/[^\s\\:;|<>]+)/) {
$backup_dir = $1;
system("mkdir -p $backup_dir") and ldie "Cannot create backup directory '$backup_dir'\n";
next;
} elsif ($arg =~ s/--ip\s*=\s*([0-9.:\[\]]+)/$1/) {
$ip = $arg;
lprint_stderr "Using IP '$ip' from command line.\n";
next;
} elsif ($arg =~ s/--ssh[-_]port\s*=\s*([0-9]+)/$1/) {
$ssh_port = $arg;
lprint_stderr "Using SSH port '$ssh_port' from command line.\n";
next;
} elsif ($arg =~ s/--ssh[-_]opts\s*=\s*(.*)/$1/) {
$ssh_opts = $arg;
lprint_stderr "Using SSH options '$ssh_opts' from command line.\n";
next;
} elsif ($arg =~ s/--macro\s*=\s*(.*)/$1/) {
$macro = $arg;
$macro =~ s/\\n/\n/mg;
next;
} elsif ($arg =~ m/^--help$/ || $arg =~ m/^-h$/) {
helplist;
next;
} elsif ($arg =~ m/^--version$/ || $arg =~ m/^-v$/) {
version;
next;
} elsif ($arg =~ m/^-(.*)/) {
ldie "unrecognized argument '-$1' (bad syntax)\n";
}
if ($arg =~ s/^force-//) {
$force++;
}
push @args, $arg;
}
my $cmd = shift @args || helplist "command argument is missing\n";
if ($cmd =~ m/^help$/ || $cmd =~ m/^h$/) {
helplist;
}
if ($cmd =~ m/^version$/ || $cmd =~ m/^v$/) {
version;
}
ldie "only root may use this tool\n" if $< != 0 && $cmd !~ m/^(cat|view.*|pretty.*)$/; # getpid() seems to be missing in perlfunc
helplist "unknown command $cmd\n" if (!exists $cmd_table{$cmd} && !$cmd =~ m/view/);
# setup syslogging
if ($cmd !~ m/^(version$|v$|view)/ && -x $logger) {
$notify = "(cmd: $cmd)";
my $print_id = $Id;
$print_id =~ s/\$|Id:| //g;
$print_id = substr($print_id, 0, 8);
llog "$print_id $host $0 @ARGV\n";
}
# checks
ldie "The $mars directory does not exist.\n" unless -d $mars;
my @mars_stat = stat($mars) or ldie "Cannot stat '$mars'\n";
# check uid
if ($mars_stat[4]) {
lwarn "Directory $mars has wrong owner uid $mars_stat[4].\n";
chown(0, 0, $mars) or ldie "Cannot chown $mars.\n";
lwarn "Fixed by setting to 0.\n";
}
# check permissions
my $perms = ($mars_stat[2] & 07777);
if ($perms != 0700) {
my $txt = sprintf("Directory $mars has wrong permissions 0%o.\n", $perms);
lwarn($txt);
chmod(0700, $mars) or ldie "Cannot chmod $mars.\n";
lwarn "Fixed to mode 0700 for security reasons.\n";
}
# lowlevel tools
if ($cmd =~ m/^lowlevel-/) {
my $func = $cmd_table{$cmd};
if ($func && ref($func) eq "ARRAY") {
lwarn "EXPERTS ONLY -- risky lowlevel command '$cmd'\n";
my @list = @$func;
while (@list) {
my $memb_func = shift @list;
next unless ref($memb_func) eq "CODE";
&{$memb_func}($cmd, @args);
}
finish_links();
exit(0);
} else {
ldie "Internal error: command table is wrong for '$cmd'";
}
}
if ($cmd !~ m/(create|join)-cluster|create-uuid|systemd|cat|view|pretty/) {
my $uuid = get_link("$mars/uuid", 1);
ldie "No valid cluster UUID $mars/uuid found. You need {create,join}-cluster first (or create-uuid).\n" unless $uuid;
}
my $res = "";
if ($cmd =~ "show|cron") {
$res = shift @args || "all";
} elsif ($cmd =~ m/^(view|pretty)/) {
$res = shift @args || "";
} elsif ($cmd =~ m/^set-.*-list$/) {
$res = shift @args || helplist "comma-separated list argument is missing\n";
} elsif ($cmd =~ m/^set-.*-value$/) {
$res = shift @args || helplist "numeric argument is missing\n";
ldie "argument '$res' isn't numeric\n" unless $res =~ m/^[0-9.]+$/;
} elsif ($cmd =~ m/^(join|merge)-cluster$/) {
$res = shift @args || helplist "peer argument is missing\n";
} elsif (!($cmd =~ m/^(create|split|leave|wait|update)-cluster|merge-cluster-list|create-uuid|cat|[a-z]+-file|trigger/)) {
$res = shift @args || helplist "resource argument is missing\n";
check_id_list($res);
}
lwarn "Using FORCE option -- hopefully you know what you are doing!\n" if $force;
my %checked_res;
sub do_one_res {
my $func = shift;
my ($cmd, $res) = @_;
if ($cmd =~ m/^cat|-file$|-list$|-link$|-value$/) { # no resource argument
} elsif (!$checked_res{"$cmd$res"}) {
$res = check_res($res) unless (!$res || $cmd =~ m/^(join|create|merge|leave|wait)-cluster|(create|join)-resource|show/);
check_res_member($cmd, $res) unless (!$res || $cmd =~ m/^(join|create|delete)-(cluster|resource)|^(merge|leave|wait)-cluster|-purge-|^show|^view/);
detect_splitbrain($res, 1);
$checked_res{"$cmd$res"} = 1;
}
call_hook(!$force, "pre", @_);
my $status = &{$func}(@_);
call_hook(!$force, "post", @_);
return $status;
}
my %skip_res;
sub expand_res_list {
my ($cmd, $res) = @_;
my @res_list=();
if ($res eq "all" && $cmd !~ m/show|cat|cluster|set-link|delete-file/) {
@res_list = get_all_resources();
} elsif ($res =~ m/,/) {
@res_list = split(",", $res);
}
return sort alphanum_cmp @res_list;
}
sub do_all_res {
my $func = shift;
my $do_abort = shift;
my $cmd = shift;
my $res = shift;
my @res_list = expand_res_list($cmd, $res);
if (@res_list) {
ldie "Cannot combine command '$cmd' with 'all' existing resources - you must explicitly name a single new resource\n" if $cmd =~ m/create|join/;
my $any_success = 0;
my $any_fail = 0;
my $any_member = 0;
my @total_list = lamport_glob("$mars/ips/ip-*");
my $total_count = scalar(@total_list);
call_hook(!$force, "all-pre", $cmd, "all", @_) if $do_abort;
foreach $res (@res_list) {
my $check ="$mars/resource-$res/data-$host";
next unless (any_exists($check));
$any_member++;
$res =~ s/^.*\/resource-(.*)$/$1/;
next if defined($skip_res{$res});
if ($verbose || $cmd !~ m/^log-/) {
my $tpl = get_macro("default-resource");
my $hint = eval_macro($cmd, $res, $tpl, @_);
lprint "--------- resource $hint\n";
}
if (!$do_abort) {
my $status = do_one_res($func, $cmd, $res, @_);
$any_fail++ if $status;
next;
}
# catch internal ldie() via eval{} failure
eval {
do_one_res($func, $cmd, $res, @_);
1;
} and
# eval{} has succeeded
$any_success = 1 or
# eval{} has failed
(
$skip_res{$res} = 1
);
}
return $any_fail unless $do_abort;
if (!$any_success) {
if (!$any_member) {
lprint "I am not member of any resource\n";
return 1;
}
ldie "all resources have errors\n";
}
call_hook(!$force, "all-post", $cmd, "all", @_);
return !$any_success;
} else {
return do_one_res($func, $cmd, $res, @_);
}
}
if ($cmd =~ m/^(view|pretty)/) {
lwarn "mars kernel module is not loaded\n" unless is_module_loaded();
do_all_res(\&view_cmd, 1, $cmd, $res, @args);
if ($res eq "all" && $cmd =~ m/^view-?(.*)/) {
my $global_macro_name = $1 ? "$1-global" : "default-global";
my $global_macro = get_macro($global_macro_name, 1);
view_cmd($global_macro_name, "", @args) if $global_macro;
}
finish_links();
get_alive_links("all", "alive", "*", 1);
exit($error_count);
}
if (!$ip) {
$ip = _get_ip() or ldie "cannot determine my IP address\n";
}
my $func = $cmd_table{$cmd};
ldie "unknown command '$cmd'\n" unless $func;
my %kid_res;
sub wait_pid_list {
my @pid_list = @_;
foreach my $pid (@pid_list) {
my $check_pid = waitpid($pid, 0);
my $status = $?;
my $sub_res = $kid_res{$pid};
if ($status > 0) {
lwarn "RESOURCE $sub_res CHILD $pid terminated with status=$status\n";
$error_count += $status;
} elsif ($check_pid == $pid) {
lprint "RESOURCE $sub_res CHILD $pid terminated successfully\n";
} else {
lwarn "RESOURCE $sub_res CHILD $pid terminated with unknown state\n";
}
}
}
sub wait_any_pid {
my ($flags) = @_;
lprint "WAITING for termination of a child...\n";
my $pid = waitpid(-1, $flags);
my $status = $?;
if ($pid > 0) {
my $sub_res = $kid_res{$pid};
if ($status != 0) {
lwarn "RESOURCE $sub_res CHILD $pid terminated with status=$status\n";
$error_count += abs($status);
} else {
lprint "RESOURCE $sub_res CHILD $pid terminated successfully\n";
}
}
return $pid;
}
sub do_fork {
my @res_list = expand_res_list($cmd, $res);
my $child_count = 0;
# only fork() when beneficial
if (@res_list && scalar(@res_list) > 1) {
foreach my $child_res (@res_list) {
# when necessary, limit the parallelism degree
if ($parallel > 0 && $child_count >= $parallel) {
my $done_pid = wait_any_pid(0);
if ($done_pid > 0) {
$child_count--;
delete $kid_res{$done_pid};
}
}
my $pid = fork();
ldie "Cannot fork()\n" unless defined($pid);
if ($pid) {
lprint "RESOURCE $child_res starting CHILD $pid\n";
$child_count++;
$kid_res{$pid} = $child_res;
} else {
# child: simply continue with new $res
$res = $child_res;
$child_count = 0;
$child_prefix = "CHILD $child_res: ";
lprint "STARTING\n";
last;
}
}
}
if ($child_count) {
my @wait_list = sort alphanum_cmp keys(%kid_res);
wait_pid_list(@wait_list);
lprint "EXIT $error_count\n";
exit($error_count);
}
}
if (ref($func) eq "ARRAY") {
my @list = @$func;
while (@list) {
my $do_loop = 0;
my $memb_func = shift @list;
# check whether fork() is possible
if ($memb_func) {
if ($memb_func eq "FORK") {
$memb_func = shift @list;
# check whether fork() is requested
if ($parallel >= 0) {
do_fork();
}
}
# check whether busy-waiting loop is requested
if ($memb_func eq "LOOP") {
$memb_func = shift @list;
$do_loop++;
}
}
# nested arrays may be used for _global_ workers
if (ref($memb_func) eq "ARRAY") {
my @sub_list = @$memb_func;
foreach my $sub_func (@sub_list) {
next unless ref($sub_func) eq "CODE";
&{$sub_func}($cmd, $res, @args);
}
}
next unless ref($memb_func) eq "CODE";
my $headline = shift @list;
lprint "---------------------------- $headline:\n" if defined($headline);
my $start_time = mars_time();
my $do_abort = !$do_loop;
my $status;
for (;;) {
$status = do_all_res($memb_func, $do_abort, $cmd, $res, @args);
last if !$do_loop;
last if (!defined($status) || !$status);
# we have a busy wait condition
sleep(1);
my $now = mars_time();
if ($now - $start_time > $timeout) {
lwarn "Condition '$headline' for resources '$res' not reached withing $timeout s\n";
$do_abort = 1;
}
}
last if (defined($status) && $status);
finish_links();
}
} elsif (ref($func) eq "CODE") {
do_all_res($func, 1, $cmd, $res, @args);
} else {
ldie "internal error: command table is wrong for '$cmd'";
}
finish_links();
lprint "EXIT $error_count\n" if $child_prefix;
exit($error_count);