marsadm: implement prosumer handover on primary handover

This commit is contained in:
Thomas Schoebel-Theuer 2020-11-04 16:36:43 +01:00 committed by Thomas Schoebel-Theuer
parent c532ff8f8a
commit e6d6a9b2fe
1 changed files with 456 additions and 19 deletions

View File

@ -3062,7 +3062,7 @@ sub check_primary {
my $fail = 0;
my $lnk = "$mars/resource-$res/actual-$host/is-primary";
my $is_primary = get_link($lnk, 1);
if (!$is_primary) { # give it a second chance
if (!$is_primary && todo_local($cmd, $res)) { # give it a second chance
$is_primary = device_exists($res);
}
unless ($is_primary) {
@ -5127,10 +5127,15 @@ sub create_res {
set_link("1", "$todo/connect");
set_link("1", "$todo/sync");
set_link("1", "$todo/allow-replay");
set_link("0", "$todo/detach-device");
set_link("0", "$todo/kill-device");
set_link("(local)", "$todo/exports");
set_link("0", "$todo/multi-prosumer");
unlink("$resdir/syncstatus-$host");
my $replay_ok = 1;
if ($create) {
set_link("(local)", "$resdir/prosumer");
set_link($host, "$resdir/primary");
set_link($host, "$resdir/userspace/last-primary");
set_link($size, "$resdir/syncstatus-$host");
@ -6234,6 +6239,8 @@ sub _primary_res {
lprint "designated primary changed from '$old' to '$new'\n";
}
# In-process bookkeeping of gate requests, keyed by peer host name.
# _set_gate() stores $gate_code for each peer whose gate it closes,
# _reset_gate() stores 0 again, and primary_phase1b() consults the entry to
# decide whether an inactive gate at a peer may be ignored.
my %gate_set;
sub _set_gate {
my ($cmd, $res, $peers) = @_;
my $touched = 0;
@ -6241,6 +6248,7 @@ sub _set_gate {
lprint "Closing gate at '$peer'\n";
my $lnk = "$mars/resource-$res/todo-$peer/gate-mask";
set_link($gate_code, $lnk);
$gate_set{$peer} = $gate_code;
}
}
@ -6253,6 +6261,9 @@ sub _reset_gate {
next if $val eq "0x0";
lprint "Resetting gate of '$res'\n";
set_link("0x0", $lnk);
$lnk =~ m:/todo-([^/]*)/:;
my $peer = $1;
$gate_set{$peer} = 0;
}
}
@ -6281,13 +6292,28 @@ sub _reset_current_primary {
# check whether primary/secondary switching is possible at all
sub primary_phase0 {
my ($cmd, $res) = @_;
ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars");
ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd =~ m/secondary/ || -d "/proc/sys/mars");
if ($force) {
check_todo($cmd, $res, "fetch", 0, 0);
}
_reset_gate(@_);
_reset_new_primary(@_);
finish_links();
_trigger(3);
my $old = _get_designated_primary($cmd, $res, -1);
my $new = parse_list_spec($cmd_suffix{$cmd}, undef, $host);
ldie "Cannot work with multiple primaries in parallel\n" if $new =~ m/[+]/;
if ($new && $new ne "(local)") {
# Provisionary: map the new syntax to the old one
if ($new eq "(none)") {
$cmd = "secondary";
} else {
# provisionary, to disappear
$host = $new;
}
}
lprint "Current designated primary: $old\n";
if ($cmd eq "primary") {
if ($cmd =~ m/primary/) {
if ($host ne $old) {
lprint "Allowing handover in cases of sync: ignore_sync=$ignore_sync\n" if $ignore_sync;
check_sync_finished($res, $host, $ignore_sync);
@ -6315,19 +6341,27 @@ sub primary_phase0 {
ldie "Won't switch to avoid unnoticed data loss. You may however do a 'primary --force'.\n" unless $force;
}
}
if (!todo_local(@_)) {
my $status = _check_logger_states($cmd, $res, $old, $new, 0, 1);
if ($status) {
ldie "Apparently hosts '$old' and '$new' seem to work on different storage ids\n";
}
}
my $want_path = "$mars/resource-$res/systemd-want";
my $want = get_link($want_path, 2);
if ($want) {
my $new;
my $oper;
if ($cmd eq "primary") {
if ($cmd =~ m/primary/) {
$new = $host;
$oper = "start";
} else {
$new = "(none)";
$oper = "stop";
}
set_link($new, $want_path);
if (todo_local($cmd, $res)) {
set_link($new, $want_path);
}
my $unit_path = "$mars/resource-$res/systemd-$oper-unit";
my $unit = get_link($unit_path, 2);
if ($unit) {
@ -6341,11 +6375,13 @@ sub primary_phase0 {
return 0;
}
}
return 0 if ($old eq $host and $cmd eq "primary");
return 0 if ($old eq $host && $cmd =~ m/primary/);
return 0 if $old eq "(none)";
my $open_count_path = "$mars/resource-$res/actual-$old/open-count";
my $device_in_use = get_link($open_count_path, 1);
if ($device_in_use) {
if ($device_in_use &&
($cmd =~ m/secondary/ ||
todo_local(@_))) {
my $dev = device_name($res, $old);
lwarn "device '$dev' for resource '$res' is $device_in_use times in use on primary host '$old'\n";
ldie "first you must umount/close the device (on host '$old')\n" unless $force;
@ -6368,6 +6404,19 @@ sub primary_phase0a {
lprint "Prepare new primary '$new' handover\n";
_switch($cmd, $res, "$mars/resource-$res/todo-$new/fetch", 1);
}
if (!todo_local(@_)) {
my $prosumers = get_prosumers(@_);
$prosumers = "(none)" if !$prosumers;
my $lnk = "$mars/resource-$res/new-primary";
my $new = $host;
lprint "Prepare new primary '$new'\n";
set_link($new, $lnk);
$lnk = "$mars/resource-$res/todo-$new/exports";
set_link($prosumers, $lnk);
$lnk = "$mars/resource-$res/todo-$new/multi-prosumer";
my $val = ($prosumers =~ m/[+]/) ? "1" : "0";
set_link($val, $lnk);
}
finish_links();
return 0 unless $force;
wait_cond($cmd, $res, "is-fetch-off");
@ -6416,36 +6465,218 @@ sub primary_phase0b {
return 0;
}
# Failure compensation for a handover that has been prepared but where the
# designated primary has not been switched.
#
# Installed via $allow_fail_action by the primary phases.  It logs a notice,
# resets the gate links and the new-primary preparation for resource $res,
# and finally flushes the pending link updates.
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns:   whatever finish_links() returns (the original had no explicit
#            return either, so this is unchanged).
sub compensate_primary_fail_prepared {
  my ($cmd, $res) = @_;

  lprint "Resetting gate and primary handover infrastucture...\n";

  # Undo the preparation steps; both helpers only queue link changes ...
  _reset_gate($cmd, $res);
  _reset_new_primary($cmd, $res);

  # ... which are written out here.
  return finish_links();
}
# Failure compensation for a handover that failed after the designated
# primary had already been switched (installed by primary_phase3).
#
# First performs everything compensate_primary_fail_prepared() does, then
# calls _reset_current_primary() to switch back, and finally tells the
# operator that a split brain cannot be ruled out.
#
# Arguments: ($cmd, $res) - command name and resource name.
sub compensate_primary_fail_switched {
  my ($cmd, $res) = @_;

  # Step 1: gate and handover preparation are rolled back as usual.
  compensate_primary_fail_prepared($cmd, $res);

  # Step 2: re-designate the previous primary.
  lprint "Switching back to previous primary...\n";
  _reset_current_primary($cmd, $res);

  lprint "Sorry, I cannot prevent SPLIT BRAIN which may follow now.\n";
}
# when necessary, switch to secondary (intermediately)
# prosumer: use gate instead
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns 0, or the non-zero status of try_to_avoid_splitbrain().
#
# NOTE(review): this listing appears to interleave pre-change and post-change
# lines of the same statements (two "my $old" declarations, two almost
# identical "if (!$force ...)" openers sharing one closing brace).
# Confirm against the applied file which variant is live.
sub primary_phase1 {
my ($cmd, $res) = @_;
# NOTE(review): the next three lines look like the pre-change form of the
# guards that appear again after the todo_local() block below.
return 0 if ($force and $cmd eq "primary");
my $old = _get_designated_primary($cmd, $res, -1);
return 0 if ($old eq $host and $cmd eq "primary");
# Non-local case: the device lives at the prosumer peers, not here.
if (!todo_local(@_)) {
finish_links();
my $prosumers = get_prosumers(@_);
# Planned (non-forced) switch to primary: close the gates at all prosumers.
if (!$force && $cmd =~ m/primary/) {
_set_gate($cmd, $res, $prosumers);
}
# $prosumers is a '+'-separated host list.
foreach my $peer (split("\\+", $prosumers)) {
my $detach_device = "0";
if ($cmd =~ m/primary/) {
# From here on a failure must undo gate and handover preparation.
$allow_fail_action = \&compensate_primary_fail_prepared;
lprint "Activating device at '$peer'\n";
} elsif ($cmd =~ m/secondary/) {
$detach_device = "1";
lprint "Removing device at '$peer'\n";
}
# Tell the peer whether to detach ("1") or keep/attach ("0") its device.
my $lnk = "$mars/resource-$res/todo-$peer/detach-device";
set_link($detach_device, $lnk);
}
finish_links();
# Only a "secondary" command continues with the code below.
return 0 unless $cmd =~ m/secondary/;
}
return 0 if ($force && $cmd =~ m/primary/);
my $old = $pri_old{$res};
# Nothing to do when this host already is the designated primary.
return 0 if ($old eq $host && $cmd =~ m/primary/);
my $new = "(none)";
# NOTE(review): the following two "if" lines look like old and new variant of
# one statement; only one closing brace follows.
if (!$force and $cmd eq "primary") {
if (!$force && $cmd =~ m/primary/) {
my $status = try_to_avoid_splitbrain($cmd, $res, $old);
return $status if $status;
}
# Already "(none)": no intermediate secondary switch needed.
return 0 if $old eq $new;
# Designate "(none)" as primary and flush the link updates.
_primary_res($res, $new, $old);
finish_links();
return 0;
}
# Compare the logger states reported by two hosts for one resource.
#
# The states are read from "$mars/resource-$res/actual-<host>/state-logger".
# As far as the patterns below show, a state is a comma-separated string whose
# first field is numeric (called stor_epoch in the comment further down), whose
# second field is logged as "stor_id", and whose last field is a single digit
# treated as a dirty flag.
#
# Arguments:
#   $cmd          - command name (not used here)
#   $res          - resource name
#   $old, $new    - the two host names to compare
#   $after_stamp  - optional; when true, both link stamps must be strictly
#                   greater than this value, otherwise _trigger(3) is called
#                   and 1 is returned
#   $only_id      - optional; when true, only the stor_id fields are compared
#
# Returns 0 when the check passes, 1 otherwise.
#
# All captures are taken in list context.  Reading $1 after a match that may
# have failed would silently reuse the capture of an earlier successful match
# (e.g. the id of $old would be taken as the id of $new, and the check would
# wrongly pass).
sub _check_logger_states {
  my ($cmd, $res, $old, $new, $after_stamp, $only_id) = @_;
  my $old_path = "$mars/resource-$res/actual-$old/state-logger";
  my $new_path = "$mars/resource-$res/actual-$new/state-logger";
  # race prevention: stamps must be retrieved _first_
  my $old_stamp = get_link_stamp($old_path);
  my $new_stamp = get_link_stamp($new_path);
  my $old_state = get_link($old_path);
  my $new_state = get_link($new_path);
  lprint "Old $old_stamp logger state: '$old_state' at $old\n" if $verbose;
  lprint "New $new_stamp logger state: '$new_state' at $new\n" if $verbose;
  if (!$old_state || !$new_state) {
    lwarn "UNEXPECTED: undefined logger state\n";
    return 1;
  }
  if ($only_id) {
    # A failed match yields an empty id, which is reported as a mismatch.
    my ($id_old) = $old_state =~ m/^[0-9.]+,([^,]+),/;
    my ($id_new) = $new_state =~ m/^[0-9.]+,([^,]+),/;
    $id_old = "" unless defined($id_old);
    $id_new = "" unless defined($id_new);
    if (!$id_old || !$id_new || $id_new ne $id_old) {
      lprint "stor_id '$id_new' != '$id_old'\n";
      return 1;
    }
    return 0;
  }
  if ($after_stamp) {
    my $status = 0;
    my $old_diff = $old_stamp - $after_stamp;
    my $new_diff = $new_stamp - $after_stamp;
    if ($old_diff <= 0) {
      lprint "Old logger stamp is not yet recent ($old_diff s)\n";
      $status = 1;
    }
    if ($new_diff <= 0) {
      lprint "New logger stamp is not yet recent ($new_diff s)\n";
      $status = 1;
    }
    if ($status) {
      # Ask for fresh information before the caller polls again.
      _trigger(3);
      return 1;
    }
    lprint "Logger stamps are recent (old=$old_diff, new=$new_diff)\n";
  }
  # Trailing single digit is the dirty flag.  When it cannot be parsed the
  # flag stays undefined (= not dirty) instead of inheriting a stale capture.
  my ($old_dirty) = $old_state =~ m/,([0-9])$/;
  my ($new_dirty) = $new_state =~ m/,([0-9])$/;
  if ($old_dirty) {
    lprint "Old logger stamp is dirty\n";
    return 1;
  }
  if ($new_dirty) {
    lprint "New logger stamp is dirty\n";
    return 1;
  }
  # strip off the stor_epoch, they are necessarily different
  $old_state =~ s/^[0-9]+\.[0-9]+,//;
  $new_state =~ s/^[0-9]+\.[0-9]+,//;
  if ($old_state ne $new_state) {
    lprint "Logger states are different\n";
    return 1;
  }
  lprint "Logger states OK\n";
  return 0;
}
# Counter used by primary_phase2() so that wait_cluster() is called only on
# its first invocation; primary_phase1b() resets it to 0.
my $phase2_waited = 0;
# Per-resource timestamp remembered by primary_phase1b(): the newest
# gate-mask link stamp seen once all gates were found closed.  It is deleted
# again whenever a gate or logger check fails, and is passed to
# _check_logger_states() as $after_stamp.
my %gate_stamps;
# Poll step after primary_phase1: verify that the old primary side has
# settled before the designated primary is switched.
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns 0 when all checks passed (or $force is set), otherwise a non-zero
# status.  Presumably a non-zero status makes the caller poll again - confirm
# against the phase driver.
#
# NOTE(review): this listing appears to interleave pre-change and post-change
# lines; see the remarks inside the body.
sub primary_phase1b {
my ($cmd, $res) = @_;
$phase2_waited = 0;
finish_links();
return 0 if $force;
my $check_logger = 0;
my $this_stamp = 0;
my $old = _get_designated_primary($cmd, $res, -1);
# NOTE(review): the next three lines look like pre-change residue: the same
# check_primary_gone() call reappears in the todo_local() branch below, and
# the "if" opened here has no matching closing brace of its own.
my $status = check_primary_gone($cmd, $res, $old);
return $status if $status;
if (!$force and $cmd eq "primary") {
my $new = $host;
if (todo_local(@_)) {
# Local device: simply wait until the old primary is gone.
my $status = check_primary_gone($cmd, $res, $old);
return $status if $status;
} elsif ($cmd =~ m/primary/ &&
_get_designated_primary($cmd, $res, 0) ne "(none)") {
# check that gates have actually closed
_trigger(3);
my $prosumers = get_prosumers(@_);
# $prosumers is a '+'-separated host list.
foreach my $peer (split("\\+", $prosumers)) {
my $lnk = "$mars/resource-$res/actual-$peer/gate-on";
my $status = get_link($lnk, 1);
# When gate is not on, assume OrphanProsumer and ignore it.
# This can happen during incidents.
if (!defined($status) || !$status) {
lprint "Gate at '$peer' is not active.\n";
# We never asked this peer to close its gate: nothing to wait for.
if (!$gate_set{$peer}) {
lprint "Ignoring inactive gate at '$peer'.\n";
next;
}
# Gate was requested but is inactive: tolerable only if nobody has the
# device open there.
my $open_count_path = "$mars/resource-$res/actual-$peer/open-count";
my $device_in_use = get_link($open_count_path, 1);
if (!$device_in_use) {
lprint "Ignoring unused gate at '$peer'.\n";
next;
}
return 1;
}
$lnk = "$mars/resource-$res/actual-$peer/gate-mask";
$status = get_link($lnk, 1);
if (!defined($status) || $status eq "") {
lwarn "Gate at '$peer' does not respond.\n";
delete $gate_stamps{$res};
return 1;
}
# remove leading 0
$status =~ s/^0x0*(.)/0x$1/;
lprint "Gate at '$peer' is '$status'\n";
# The reported mask must equal the code we requested.
if ($status ne $gate_code) {
delete $gate_stamps{$res};
return 1;
}
# Get timestamp of lastly closed gate
if (!defined($gate_stamps{$res})) {
my $stamp = get_link_stamp($lnk);
$this_stamp = $stamp if $stamp > $this_stamp;
}
lprint "Gate at '$peer' is closed.\n";
}
# Check logger states
# First check validity
my $status = _check_logger_states($cmd, $res, $old, $new);
if ($status) {
delete $gate_stamps{$res};
return $status;
}
# Now remember the first occurrence in time
if (!defined($gate_stamps{$res}) && $this_stamp) {
$gate_stamps{$res} = $this_stamp;
}
# Check again, this time also checking the timestamps
$check_logger = 1;
my $after_stamp = $gate_stamps{$res};
$status = _check_logger_states($cmd, $res, $old, $new, $after_stamp);
return $status if $status;
}
if (!$force && $cmd =~ m/primary/) {
my $status = try_to_avoid_splitbrain($cmd, $res, $old);
return $status if $status;
}
if ($check_logger) {
# paranoia: check once again after split-brain detection
my $after_stamp = $gate_stamps{$res};
my $status = _check_logger_states($cmd, $res, $old, $new, $after_stamp);
return $status if $status;
}
# Finally request detach ("1", for secondary) or attach ("0") of the device
# at this host.
my $lnk = "$mars/resource-$res/todo-$host/detach-device";
my $detach_device = ($cmd =~ m/secondary/) ? "1" : "0";
set_link($detach_device, $lnk);
return 0;
}
@ -6453,7 +6684,9 @@ sub primary_phase1b {
sub primary_phase2 {
my ($cmd, $res) = @_;
return 0 if $force;
return 0 unless $cmd eq "primary";
return 0 unless $cmd =~ m/primary/;
finish_links();
return if !todo_local(@_);
wait_cluster($cmd) if !$phase2_waited++;
my $old = $pri_old{$res};
return check_primary_gone($cmd, $res, $old);
@ -6472,23 +6705,193 @@ sub primary_phase2b {
# when necessary, switch to primary
#
# Arguments: ($cmd, $res) - command name and resource name.
# Always returns 0.
#
# Records the previously designated primary in %pri_old, designates $host as
# the new primary and installs compensate_primary_fail_switched() as the
# failure action.
sub primary_phase3 {
my ($cmd, $res) = @_;
# NOTE(review): the next two guards look like the pre-change and post-change
# form of the same line (diff residue) - confirm which one is live.
return 0 unless $cmd eq "primary";
return 0 unless $cmd =~ m/primary/;
my $old = _get_designated_primary($cmd, $res, -1);
# Remembered for the later phases, which read $pri_old{$res}.
$pri_old{$res} = $old;
my $new = $host;
_primary_res($res, $new, $old);
# From now on a failure also has to switch the primary back.
$allow_fail_action = \&compensate_primary_fail_switched;
my $prosumers = get_prosumers(@_);
# Purely local prosumer: make sure the new primary attaches its own device
# and exports only locally.
if ($prosumers eq "(local)") {
my $lnk = "$mars/resource-$res/todo-$new/detach-device";
set_link("0", $lnk);
$lnk = "$mars/resource-$res/todo-$new/exports";
set_link("(local)", $lnk);
$lnk = "$mars/resource-$res/todo-$new/multi-prosumer";
set_link("0", $lnk);
}
return 0;
}
# Poll step after the primary switch: check that every active prosumer has
# prepared its handover towards the newly designated primary.
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns 0 when nothing (more) has to be waited for, 1 when a peer is not
# ready yet.  Presumably 1 makes the caller poll again - confirm against the
# phase driver.
#
# The host name is extracted from the peer paths in list context.  Reading $1
# after a match that may have failed would silently reuse the capture of an
# earlier successful match; an unparsable path now yields an empty host name.
sub primary_phase3b {
  my ($cmd, $res) = @_;
  finish_links();
  if (!$force && !todo_local(@_)) {
    # check that prosumer handover is prepared for the right primary
    my $primary = _get_designated_primary($cmd, $res, -1);
    my $prosumers = get_prosumers(@_);
    _trigger(3);
    # $prosumers is a '+'-separated host list.
    foreach my $peer (split("\\+", $prosumers)) {
      my $lnk = "$mars/resource-$res/actual-$peer/prosumer-on";
      my $val = get_link($lnk, 1);
      unless ($val) {
        # NOTE(review): this ends the whole check, the remaining peers are
        # not examined - confirm that this is intended.
        lprint "Prosumer at '$peer' is not active\n";
        return 0;
      }
      # Which host is the prosumer currently connected with?
      $lnk = "$mars/resource-$res/actual-$peer/prosumer-peer-path";
      $val = get_link($lnk, 1);
      return 1 unless $val;
      my ($connected) = $val =~ m/@(.+?)(?:$|:)/;
      $connected = "" unless defined($connected);
      lprint "Prosumer at '$peer' is connected with '$connected'\n";
      if ($connected eq $primary) {
        lprint "No handover necessary for '$peer'\n";
        next;
      }
      # Otherwise the prepared (new) connection must exist ...
      $lnk = "$mars/resource-$res/actual-$peer/new-prosumer-activated";
      $val = get_link($lnk, 1);
      if (!$val) {
        lprint "Prosumer handover not yet prepared at '$peer'\n";
        return 1;
      }
      # ... and must point to the designated primary.
      $lnk = "$mars/resource-$res/actual-$peer/new-prosumer-peer-path";
      $val = get_link($lnk, 1);
      return 1 unless $val;
      ($connected) = $val =~ m/@(.+?)(?:$|:)/;
      $connected = "" unless defined($connected);
      lprint "Prosumer handover preparation at '$peer' is connected with '$connected'\n";
      unless ($connected eq $primary) {
        lprint "Waiting for handover prepare at '$peer'\n";
        return 1;
      }
      lprint "OK handover prepared at '$peer'\n";
    }
  }
  return 0;
}
# activate prosumer-handover
#
# Arguments: ($cmd, $res) - command name and resource name.
# Always returns 0, like the sibling phases.  Without the explicit return the
# result would be whatever _trigger(3) returns, or the value of the "if"
# condition when the branch is skipped.
#
# Unless forced or local, sets "todo-<peer>/handover-prosumer" to "1" for
# every prosumer peer, flushes the links, and installs
# compensate_primary_fail_prepared() as the failure action.
sub primary_phase3c {
  my ($cmd, $res) = @_;
  if (!$force && !todo_local(@_)) {
    my $prosumers = get_prosumers(@_);
    # $prosumers is a '+'-separated host list.
    foreach my $peer (split("\\+", $prosumers)) {
      lprint "Activating prosumer handover at '$peer'\n";
      my $lnk = "$mars/resource-$res/todo-$peer/handover-prosumer";
      set_link("1", $lnk);
    }
    finish_links();
    $allow_fail_action = \&compensate_primary_fail_prepared;
    _trigger(3);
  }
  return 0;
}
# wait for prosumer-handover finished
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns 0 when forced, local, or when every active prosumer is connected
# with the new primary ($host); in the latter case the gates are reset.
# Returns 1 while a handover is still pending or a peer reports something
# unexpected.  Presumably 1 makes the caller poll again - confirm against the
# phase driver.
#
# The host name is extracted from the peer path in list context.  Reading $1
# after a match that may have failed would silently reuse an earlier capture
# (which could even equal $new); an unparsable path now yields an empty host
# name and is reported as UNEXPECTED.
sub primary_phase3d {
  my ($cmd, $res) = @_;
  return 0 if $force || todo_local(@_);
  my $new = $host;
  my $prosumers = get_prosumers(@_);
  # $prosumers is a '+'-separated host list.
  foreach my $peer (split("\\+", $prosumers)) {
    my $lnk = "$mars/resource-$res/actual-$peer/prosumer-on";
    my $val = get_link($lnk, 1);
    unless ($val) {
      # NOTE(review): this ends the whole check (gates are not reset and the
      # remaining peers are not examined) - confirm that this is intended.
      lprint "Prosumer at '$peer' is not active\n";
      return 0;
    }
    # The peer clears this flag once the prepared connection took over.
    $lnk = "$mars/resource-$res/actual-$peer/new-prosumer-activated";
    $val = get_link($lnk, 1);
    if ($val) {
      lprint "Prosumer handover not yet finished at '$peer'\n";
      return 1;
    }
    $lnk = "$mars/resource-$res/actual-$peer/prosumer-peer-path";
    $val = get_link($lnk, 1);
    unless ($val) {
      lwarn "UNEXPECTED: host '$peer' suddenly reports no peer path anymore.\n";
      return 1;
    }
    my ($connected) = $val =~ m/@(.+?)(?:$|:)/;
    $connected = "" unless defined($connected);
    lprint "Prosumer at '$peer' is connected with '$connected'\n";
    if ($connected ne $new) {
      lwarn "UNEXPECTED: host '$peer' is connected with '$connected' instead of '$new'.\n";
      return 1;
    }
    lprint "Prosumer handover finished at '$peer'\n";
  }
  # this is time critical
  _reset_gate(@_);
  finish_links();
  _trigger(3);
  return 0;
}
# Wait until the gates at all prosumers are open again.
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns 0 when forced, local, when a prosumer is found inactive, or when
# every prosumer reports gate-mask "0x0"; in the last case the failure action
# ($allow_fail_action) is cleared.  Returns 1 (after calling _trigger(3))
# while a gate does not respond or is not yet open.
#
# Note: no exports are modified in this sub.
sub primary_phase3e {
  my ($cmd, $res) = @_;
  if ($force || todo_local(@_)) {
    return 0;
  }
  # Inspect the reported gate mask of each prosumer ('+'-separated list).
  my $prosumers = get_prosumers(@_);
  for my $peer (split("\\+", $prosumers)) {
    my $active = get_link("$mars/resource-$res/actual-$peer/prosumer-on", 1);
    if (!$active) {
      lprint "Prosumer at '$peer' is not active\n";
      return 0;
    }
    my $status = get_link("$mars/resource-$res/actual-$peer/gate-mask", 1);
    unless (defined($status) && $status ne "") {
      lwarn "Gate at '$peer' does not respond.\n";
      _trigger(3);
      return 1;
    }
    # Normalize the hex value: drop leading zeros after the "0x" prefix.
    $status =~ s/^0x0*(.)/0x$1/;
    lprint "Gate at '$peer' is '$status'\n";
    unless ($status eq "0x0") {
      _trigger(3);
      return 1;
    }
    lprint "Gate at '$peer' is open\n";
  }
  # All gates are open: flush, trigger, and drop the compensation action.
  finish_links();
  _trigger(3);
  $allow_fail_action = undef;
  return 0;
}
# Wait until the old primary (as recorded in %pri_old) no longer reports
# itself as primary, then reset the new-primary preparation.
#
# Arguments: ($cmd, $res) - command name and resource name.
# Returns 1 (after calling _trigger(3)) while
# "actual-<old>/is-primary" is still set; otherwise calls
# _reset_new_primary() and returns 0.  The wait is skipped entirely when
# forced or local.
sub primary_phase3f {
  my ($cmd, $res) = @_;
  unless ($force || todo_local(@_)) {
    my $old = $pri_old{$res};
    if ($old eq $host) {
      # Same host before and after: nothing can be "gone".
      lprint "Old primary '$old' is the new one.\n";
    } else {
      my $still_primary = get_link("$mars/resource-$res/actual-$old/is-primary", 1);
      if ($still_primary) {
        _trigger(3);
        lprint "Old primary '$old' not yet gone\n";
        return 1;
      }
      lprint "Old primary '$old' is gone.\n";
    }
  }
  _reset_new_primary(@_);
  return 0;
}
# wait for device to appear / disappear
sub primary_phase4 {
my ($cmd, $res) = @_;
if($cmd eq "secondary") {
check_mars_device($cmd, $res, 1, 1);
if($cmd =~ m/secondary/) {
if (todo_local(@_)) {
check_mars_device($cmd, $res, 1, 1);
}
return 0;
}
my $ok = detect_splitbrain($res, 1);
@ -6507,7 +6910,17 @@ sub primary_phase4 {
lwarn "\n";
return 0;
}
check_mars_device($cmd, $res, 1, 0);
if (todo_local(@_)) {
check_mars_device($cmd, $res, 1, 0);
} elsif (!$force) {
my $old = $pri_old{$res};
my $new = $host;
lprint "Unexporting old primary '$old' for safety.\n";
my $lnk = "$mars/resource-$res/todo-$old/exports";
set_link("(none)", $lnk);
$lnk = "$mars/resource-$res/todo-$old/multi-prosumer";
set_link("0", $lnk);
}
# new switch semantics, when nothing has failed before: up
up_res_phase1(@_);
return 0;
@ -9798,6 +10211,7 @@ my %cmd_table =
],
\&primary_phase0,
"check preconditions",
"FORK",
\&primary_phase0a,
"conditionally wait for fetch off",
@ -9806,21 +10220,44 @@ my %cmd_table =
"LOOP",
\&primary_phase1,
"leave primary state",
"LOOP",
\&primary_phase1b,
"trigger remote",
"LOOP",
\&primary_phase2,
"wait for cluster when necessary",
"LOOP",
\&primary_phase2b,
"avoid split brain",
\&primary_phase3,
"switch to primary",
"LOOP",
\&primary_phase3b,
"trigger remote",
\&primary_phase3c,
"trigger prosumer handover",
"LOOP",
\&primary_phase3d,
"wait for prosumer handover and open gate",
"LOOP",
\&primary_phase3e,
"wait for gate open and reset old primary exports",
"LOOP",
\&primary_phase3f,
"wait for primary gone",
\&primary_phase4,
"wait for device",
"LOOP",
\&primary_phase5,
"trigger systemd",