diff --git a/userspace/marsadm b/userspace/marsadm index b2620291..372ef937 100755 --- a/userspace/marsadm +++ b/userspace/marsadm @@ -3062,7 +3062,7 @@ sub check_primary { my $fail = 0; my $lnk = "$mars/resource-$res/actual-$host/is-primary"; my $is_primary = get_link($lnk, 1); - if (!$is_primary) { # give it a second chance + if (!$is_primary && todo_local($cmd, $res)) { # give it a second chance $is_primary = device_exists($res); } unless ($is_primary) { @@ -5127,10 +5127,15 @@ sub create_res { set_link("1", "$todo/connect"); set_link("1", "$todo/sync"); set_link("1", "$todo/allow-replay"); + set_link("0", "$todo/detach-device"); + set_link("0", "$todo/kill-device"); + set_link("(local)", "$todo/exports"); + set_link("0", "$todo/multi-prosumer"); unlink("$resdir/syncstatus-$host"); my $replay_ok = 1; if ($create) { + set_link("(local)", "$resdir/prosumer"); set_link($host, "$resdir/primary"); set_link($host, "$resdir/userspace/last-primary"); set_link($size, "$resdir/syncstatus-$host"); @@ -6234,6 +6239,8 @@ sub _primary_res { lprint "designated primary changed from '$old' to '$new'\n"; } +my %gate_set; + sub _set_gate { my ($cmd, $res, $peers) = @_; my $touched = 0; @@ -6241,6 +6248,7 @@ sub _set_gate { lprint "Closing gate at '$peer'\n"; my $lnk = "$mars/resource-$res/todo-$peer/gate-mask"; set_link($gate_code, $lnk); + $gate_set{$peer} = $gate_code; } } @@ -6253,6 +6261,9 @@ sub _reset_gate { next if $val eq "0x0"; lprint "Resetting gate of '$res'\n"; set_link("0x0", $lnk); + $lnk =~ m:/todo-([^/]*)/:; + my $peer = $1; + $gate_set{$peer} = 0; } } @@ -6281,13 +6292,28 @@ sub _reset_current_primary { # check whether primary/secondary switching is possible at all sub primary_phase0 { my ($cmd, $res) = @_; - ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars"); + ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd =~ m/secondary/ || -d "/proc/sys/mars"); if ($force) { check_todo($cmd, $res, "fetch", 0, 0); } + _reset_gate(@_); + _reset_new_primary(@_); + finish_links(); + _trigger(3); my $old = _get_designated_primary($cmd, $res, -1); + my $new = parse_list_spec($cmd_suffix{$cmd}, undef, $host); + ldie "Cannot work with multiple primaries in parallel\n" if $new =~ m/[+]/; + if ($new && $new ne "(local)") { + # Provisionary: map the new syntax to the old one + if ($new eq "(none)") { + $cmd = "secondary"; + } else { + # provisionary, to disappear + $host = $new; + } + } lprint "Current designated primary: $old\n"; - if ($cmd eq "primary") { + if ($cmd =~ m/primary/) { if ($host ne $old) { lprint "Allowing handover in cases of sync: ignore_sync=$ignore_sync\n" if $ignore_sync; check_sync_finished($res, $host, $ignore_sync); @@ -6315,19 +6341,27 @@ sub primary_phase0 { ldie "Won't switch to avoid unnoticed data loss. You may however do a 'primary --force'.\n" unless $force; } } + if (!todo_local(@_)) { + my $status = _check_logger_states($cmd, $res, $old, $new, 0, 1); + if ($status) { + ldie "Apparently hosts '$old' and '$new' seem to work on different storage ids\n"; + } + } my $want_path = "$mars/resource-$res/systemd-want"; my $want = get_link($want_path, 2); if ($want) { my $new; my $oper; - if ($cmd eq "primary") { + if ($cmd =~ m/primary/) { $new = $host; $oper = "start"; } else { $new = "(none)"; $oper = "stop"; } - set_link($new, $want_path); + if (todo_local($cmd, $res)) { + set_link($new, $want_path); + } my $unit_path = "$mars/resource-$res/systemd-$oper-unit"; my $unit = get_link($unit_path, 2); if ($unit) { @@ -6341,11 +6375,13 @@ sub primary_phase0 { return 0; } } - return 0 if ($old eq $host and $cmd eq "primary"); + return 0 if ($old eq $host && $cmd =~ m/primary/); return 0 if $old eq "(none)"; my $open_count_path = "$mars/resource-$res/actual-$old/open-count"; my $device_in_use = get_link($open_count_path, 1); - if ($device_in_use) { + if ($device_in_use && + ($cmd =~ m/secondary/ || + todo_local(@_))) { my $dev = device_name($res, $old); lwarn "device '$dev' for resource '$res' is $device_in_use times in use on primary host '$old'\n"; ldie "first you must umount/close the device (on host '$old')\n" unless $force; @@ -6368,6 +6404,19 @@ sub primary_phase0a { lprint "Prepare new primary '$new' handover\n"; _switch($cmd, $res, "$mars/resource-$res/todo-$new/fetch", 1); } + if (!todo_local(@_)) { + my $prosumers = get_prosumers(@_); + $prosumers = "(none)" if !$prosumers; + my $lnk = "$mars/resource-$res/new-primary"; + my $new = $host; + lprint "Prepare new primary '$new'\n"; + set_link($new, $lnk); + $lnk = "$mars/resource-$res/todo-$new/exports"; + set_link($prosumers, $lnk); + $lnk = "$mars/resource-$res/todo-$new/multi-prosumer"; + my $val = ($prosumers =~ m/[+]/) ? "1" : "0"; + set_link($val, $lnk); + } finish_links(); return 0 unless $force; wait_cond($cmd, $res, "is-fetch-off"); @@ -6416,36 +6465,218 @@ sub primary_phase0b { return 0; } +sub compensate_primary_fail_prepared { + my ($cmd, $res) = @_; + lprint "Resetting gate and primary handover infrastucture...\n"; + _reset_gate($cmd, $res); + _reset_new_primary($cmd, $res); + finish_links(); +} + +sub compensate_primary_fail_switched { + my ($cmd, $res) = @_; + compensate_primary_fail_prepared(@_); + lprint "Switching back to previous primary...\n"; + _reset_current_primary($cmd, $res); + lprint "Sorry, I cannot prevent SPLIT BRAIN which may follow now.\n"; +} + # when necessary, switch to secondary (intermediately) +# prosumer: use gate instead sub primary_phase1 { my ($cmd, $res) = @_; - return 0 if ($force and $cmd eq "primary"); - my $old = _get_designated_primary($cmd, $res, -1); - return 0 if ($old eq $host and $cmd eq "primary"); + if (!todo_local(@_)) { + finish_links(); + my $prosumers = get_prosumers(@_); + if (!$force && $cmd =~ m/primary/) { + _set_gate($cmd, $res, $prosumers); + } + foreach my $peer (split("\\+", $prosumers)) { + my $detach_device = "0"; + if ($cmd =~ m/primary/) { + $allow_fail_action = \&compensate_primary_fail_prepared; + lprint "Activating device at '$peer'\n"; + } elsif ($cmd =~ m/secondary/) { + $detach_device = "1"; + lprint "Removing device at '$peer'\n"; + } + my $lnk = "$mars/resource-$res/todo-$peer/detach-device"; + set_link($detach_device, $lnk); + } + finish_links(); + return 0 unless $cmd =~ m/secondary/; + } + return 0 if ($force && $cmd =~ m/primary/); + my $old = $pri_old{$res}; + return 0 if ($old eq $host && $cmd =~ m/primary/); my $new = "(none)"; - if (!$force and $cmd eq "primary") { + if (!$force && $cmd =~ m/primary/) { my $status = try_to_avoid_splitbrain($cmd, $res, $old); return $status if $status; } return 0 if $old eq $new; _primary_res($res, $new, $old); + finish_links(); + return 0; +} + +sub _check_logger_states { + my ($cmd, $res, $old, $new, $after_stamp, $only_id) = @_; + my $old_path = "$mars/resource-$res/actual-$old/state-logger"; + my $new_path = "$mars/resource-$res/actual-$new/state-logger"; + # race prevention: stamps must be retrieved _first_ + my $old_stamp = get_link_stamp($old_path); + my $new_stamp = get_link_stamp($new_path); + my $old_state = get_link($old_path); + my $new_state = get_link($new_path); + lprint "Old $old_stamp logger state: '$old_state' at $old\n" if $verbose; + lprint "New $new_stamp logger state: '$new_state' at $new\n" if $verbose; + if (!$old_state || !$new_state) { + lwarn "UNEXPECTED: undefined logger state\n"; + return 1; + } + if ($only_id) { + $old_state =~ m/^[0-9.]+,([^,]+),/; + my $id_old = $1; + $new_state =~ m/^[0-9.]+,([^,]+),/; + my $id_new = $1; + if (!$id_old || !$id_new || $id_new ne $id_old) { + lprint "stor_id '$id_new' != '$id_old'\n"; + return 1; + } + return 0; + } + if ($after_stamp) { + my $status = 0; + my $old_diff = $old_stamp - $after_stamp; + my $new_diff = $new_stamp - $after_stamp; + if ($old_diff <= 0) { + lprint "Old logger stamp is not yet recent ($old_diff s)\n"; + $status = 1; + } + if ($new_diff <= 0) { + lprint "New logger stamp is not yet recent ($new_diff s)\n"; + $status = 1; + } + if ($status) { + _trigger(3); + return 1; + } + lprint "Logger stamps are recent (old=$old_diff, new=$new_diff)\n"; + } + $old_state =~ m/,([0-9])$/; + my $old_dirty = $1; + $new_state =~ m/,([0-9])$/; + my $new_dirty = $1; + if ($old_dirty) { + lprint "Old logger stamp is dirty\n"; + return 1; + } + if ($new_dirty) { + lprint "New logger stamp is dirty\n"; + return 1; + } + # strip off the stor_epoch, they are necessarily different + $old_state =~ s/^[0-9]+\.[0-9]+,//; + $new_state =~ s/^[0-9]+\.[0-9]+,//; + if ($old_state ne $new_state) { + lprint "Logger states are different\n"; + return 1; + } + lprint "Logger states OK\n"; return 0; } my $phase2_waited = 0; +my %gate_stamps; sub primary_phase1b { my ($cmd, $res) = @_; $phase2_waited = 0; finish_links(); return 0 if $force; + my $check_logger = 0; + my $this_stamp = 0; my $old = _get_designated_primary($cmd, $res, -1); - my $status = check_primary_gone($cmd, $res, $old); - return $status if $status; - if (!$force and $cmd eq "primary") { + my $new = $host; + if (todo_local(@_)) { + my $status = check_primary_gone($cmd, $res, $old); + return $status if $status; + } elsif ($cmd =~ m/primary/ && + _get_designated_primary($cmd, $res, 0) ne "(none)") { + # check that gates have actually closed + _trigger(3); + my $prosumers = get_prosumers(@_); + foreach my $peer (split("\\+", $prosumers)) { + my $lnk = "$mars/resource-$res/actual-$peer/gate-on"; + my $status = get_link($lnk, 1); + # When gate is not on, assume OrphanProsumer and ignore it. + # This can happen during incidents. + if (!defined($status) || !$status) { + lprint "Gate at '$peer' is not active.\n"; + if (!$gate_set{$peer}) { + lprint "Ignoring inactive gate at '$peer'.\n"; + next; + } + my $open_count_path = "$mars/resource-$res/actual-$peer/open-count"; + my $device_in_use = get_link($open_count_path, 1); + if (!$device_in_use) { + lprint "Ignoring unused gate at '$peer'.\n"; + next; + } + return 1; + } + $lnk = "$mars/resource-$res/actual-$peer/gate-mask"; + $status = get_link($lnk, 1); + if (!defined($status) || $status eq "") { + lwarn "Gate at '$peer' does not respond.\n"; + delete $gate_stamps{$res}; + return 1; + } + # remove leading 0 + $status =~ s/^0x0*(.)/0x$1/; + lprint "Gate at '$peer' is '$status'\n"; + if ($status ne $gate_code) { + delete $gate_stamps{$res}; + return 1; + } + # Get timestamp of lastly closed gate + if (!defined($gate_stamps{$res})) { + my $stamp = get_link_stamp($lnk); + $this_stamp = $stamp if $stamp > $this_stamp; + } + lprint "Gate at '$peer' is closed.\n"; + } + # Check logger states + # First check validitiy + my $status = _check_logger_states($cmd, $res, $old, $new); + if ($status) { + delete $gate_stamps{$res}; + return $status; + } + # Now remember the first occurence in time + if (!defined($gate_stamps{$res}) && $this_stamp) { + $gate_stamps{$res} = $this_stamp; + } + # Check again, this time also checking the timestamps + $check_logger = 1; + my $after_stamp = $gate_stamps{$res}; + $status = _check_logger_states($cmd, $res, $old, $new, $after_stamp); + return $status if $status; + } + if (!$force && $cmd =~ m/primary/) { my $status = try_to_avoid_splitbrain($cmd, $res, $old); return $status if $status; } + if ($check_logger) { + # paranoia: check once again after split-brain detection + my $after_stamp = $gate_stamps{$res}; + my $status = _check_logger_states($cmd, $res, $old, $new, $after_stamp); + return $status if $status; + } + my $lnk = "$mars/resource-$res/todo-$host/detach-device"; + my $detach_device = ($cmd =~ m/secondary/) ? "1" : "0"; + set_link($detach_device, $lnk); return 0; } @@ -6453,7 +6684,9 @@ sub primary_phase1b { sub primary_phase2 { my ($cmd, $res) = @_; return 0 if $force; - return 0 unless $cmd eq "primary"; + return 0 unless $cmd =~ m/primary/; + finish_links(); + return if !todo_local(@_); wait_cluster($cmd) if !$phase2_waited++; my $old = $pri_old{$res}; return check_primary_gone($cmd, $res, $old); @@ -6472,23 +6705,193 @@ sub primary_phase2b { # when necessary, switch to primary sub primary_phase3 { my ($cmd, $res) = @_; - return 0 unless $cmd eq "primary"; + return 0 unless $cmd =~ m/primary/; my $old = _get_designated_primary($cmd, $res, -1); + $pri_old{$res} = $old; my $new = $host; _primary_res($res, $new, $old); + $allow_fail_action = \&compensate_primary_fail_switched; + my $prosumers = get_prosumers(@_); + if ($prosumers eq "(local)") { + my $lnk = "$mars/resource-$res/todo-$new/detach-device"; + set_link("0", $lnk); + $lnk = "$mars/resource-$res/todo-$new/exports"; + set_link("(local)", $lnk); + $lnk = "$mars/resource-$res/todo-$new/multi-prosumer"; + set_link("0", $lnk); + } return 0; } sub primary_phase3b { + my ($cmd, $res) = @_; finish_links(); + if (!$force && !todo_local(@_)) { + # check that prosumer handover is prepared for the right primary + my $primary = _get_designated_primary($cmd, $res, -1); + my $prosumers = get_prosumers(@_); + _trigger(3); + foreach my $peer (split("\\+", $prosumers)) { + my $lnk = "$mars/resource-$res/actual-$peer/prosumer-on"; + my $val = get_link($lnk, 1); + unless ($val) { + lprint "Prosumer at '$peer' is not active\n"; + return 0; + } + $lnk = "$mars/resource-$res/actual-$peer/prosumer-peer-path"; + $val = get_link($lnk, 1); + return 1 unless $val; + $val =~ m/@(.+?)(?:$|:)/; + my $connected = $1; + lprint "Prosumer at '$peer' is connected with '$connected'\n"; + if ($connected eq $primary) { + lprint "No handover necessary for '$peer'\n"; + next; + } + $lnk = "$mars/resource-$res/actual-$peer/new-prosumer-activated"; + $val = get_link($lnk, 1); + if (!$val) { + lprint "Prosumer handover not yet prepared at '$peer'\n"; + return 1; + } + $lnk = "$mars/resource-$res/actual-$peer/new-prosumer-peer-path"; + $val = get_link($lnk, 1); + return 1 unless $val; + $val =~ m/@(.+?)(?:$|:)/; + $connected = $1; + lprint "Prosumer handover preparation at '$peer' is connected with '$connected'\n"; + unless ($connected eq $primary) { + lprint "Waiting for handover prepare at '$peer'\n"; + return 1; + } + lprint "OK handover prepared at '$peer'\n"; + } + } + return 0; +} + +# activate prosumer-handover +sub primary_phase3c { + my ($cmd, $res) = @_; + if (!$force && !todo_local(@_)) { + my $prosumers = get_prosumers(@_); + foreach my $peer (split("\\+", $prosumers)) { + lprint "Activating prosumer handover at '$peer'\n"; + my $lnk = "$mars/resource-$res/todo-$peer/handover-prosumer"; + set_link("1", $lnk); + } + finish_links(); + $allow_fail_action = \&compensate_primary_fail_prepared; + _trigger(3); + } +} + +# wait for prosumer-handover finished +sub primary_phase3d { + my ($cmd, $res) = @_; + return 0 if $force || todo_local(@_); + my $old = $pri_old{$res}; + my $new = $host; + my $prosumers = get_prosumers(@_); + foreach my $peer (split("\\+", $prosumers)) { + my $lnk = "$mars/resource-$res/actual-$peer/prosumer-on"; + my $val = get_link($lnk, 1); + unless ($val) { + lprint "Prosumer at '$peer' is not active\n"; + return 0; + } + $lnk = "$mars/resource-$res/actual-$peer/new-prosumer-activated"; + $val = get_link($lnk, 1); + if ($val) { + lprint "Prosumer handover not yet finished at '$peer'\n"; + return 1; + } + $lnk = "$mars/resource-$res/actual-$peer/prosumer-peer-path"; + $val = get_link($lnk, 1); + unless ($val) { + lwarn "UNEXPECTED: host '$peer' suddenly reports no peer path anymore.\n"; + return 1; + } + $val =~ m/@(.+?)(?:$|:)/; + my $connected = $1; + lprint "Prosumer at '$peer' is connected with '$connected'\n"; + if ($connected ne $new) { + lwarn "UNEXPECTED: host '$peer' is connected with '$connected' instead of '$new'.\n"; + return 1; + } + lprint "Prosumer handover finished at '$peer'\n"; + } + # this is time critical + _reset_gate(@_); + finish_links(); + _trigger(3); + return 0; +} + +# wait for gate closed and reset old primary exports +sub primary_phase3e { + my ($cmd, $res) = @_; + return 0 if $force || todo_local(@_); + # check that gates have actually closed + my $prosumers = get_prosumers(@_); + foreach my $peer (split("\\+", $prosumers)) { + my $lnk = "$mars/resource-$res/actual-$peer/prosumer-on"; + my $val = get_link($lnk, 1); + unless ($val) { + lprint "Prosumer at '$peer' is not active\n"; + return 0; + } + $lnk = "$mars/resource-$res/actual-$peer/gate-mask"; + my $status = get_link($lnk, 1); + if (!defined($status) || $status eq "") { + lwarn "Gate at '$peer' does not respond.\n"; + _trigger(3); + return 1; + } + # remove leading 0 + $status =~ s/^0x0*(.)/0x$1/; + lprint "Gate at '$peer' is '$status'\n"; + if ($status ne "0x0") { + _trigger(3); + return 1; + } + lprint "Gate at '$peer' is open\n"; + } + finish_links(); + _trigger(3); + $allow_fail_action = undef; + return 0; +} + +sub primary_phase3f { + my ($cmd, $res) = @_; + if (!$force && !todo_local(@_)) { + my $old = $pri_old{$res}; + my $new = $host; + if ($old ne $new) { + my $lnk = "$mars/resource-$res/actual-$old/is-primary"; + my $val = get_link($lnk, 1); + if ($val) { + _trigger(3); + lprint "Old primary '$old' not yet gone\n"; + return 1; + } + lprint "Old primary '$old' is gone.\n"; + } else { + lprint "Old primary '$old' is the new one.\n"; + } + } + _reset_new_primary(@_); return 0; } # wait for device to appear / disappear sub primary_phase4 { my ($cmd, $res) = @_; - if($cmd eq "secondary") { - check_mars_device($cmd, $res, 1, 1); + if($cmd =~ m/secondary/) { + if (todo_local(@_)) { + check_mars_device($cmd, $res, 1, 1); + } return 0; } my $ok = detect_splitbrain($res, 1); @@ -6507,7 +6910,17 @@ sub primary_phase4 { lwarn "\n"; return 0; } - check_mars_device($cmd, $res, 1, 0); + if (todo_local(@_)) { + check_mars_device($cmd, $res, 1, 0); + } elsif (!$force) { + my $old = $pri_old{$res}; + my $new = $host; + lprint "Unexporting old primary '$old' for safety.\n"; + my $lnk = "$mars/resource-$res/todo-$old/exports"; + set_link("(none)", $lnk); + $lnk = "$mars/resource-$res/todo-$old/multi-prosumer"; + set_link("0", $lnk); + } # new switch semantics, when nothing has failed before: up up_res_phase1(@_); return 0; @@ -9798,6 +10211,7 @@ my %cmd_table = ], \&primary_phase0, "check preconditions", + "FORK", \&primary_phase0a, "conditionally wait for fetch off", @@ -9806,21 +10220,44 @@ my %cmd_table = "LOOP", \&primary_phase1, "leave primary state", + "LOOP", \&primary_phase1b, "trigger remote", + "LOOP", \&primary_phase2, "wait for cluster when necessary", + "LOOP", \&primary_phase2b, "avoid split brain", + \&primary_phase3, "switch to primary", + + "LOOP", \&primary_phase3b, "trigger remote", + + \&primary_phase3c, + "trigger prosumer handover", + + "LOOP", + \&primary_phase3d, + "wait for prosumer handover and open gate", + + "LOOP", + \&primary_phase3e, + "wait for gate open and reset old primary exports", + + "LOOP", + \&primary_phase3f, + "wait for primary gone", + \&primary_phase4, "wait for device", + "LOOP", \&primary_phase5, "trigger systemd",