diff --git a/kernel/mars_if.c b/kernel/mars_if.c index 0b4a7bec..8c6bbeb5 100644 --- a/kernel/mars_if.c +++ b/kernel/mars_if.c @@ -813,8 +813,8 @@ static int if_switch(struct if_brick *brick) input->q = NULL; } #endif - if (atomic_read(&input->open_count) > 0) { - MARS_INF("device '%s' is open %d times, cannot shutdown\n", disk->disk_name, atomic_read(&input->open_count)); + if (atomic_read(&brick->open_count) > 0) { + MARS_INF("device '%s' is open %d times, cannot shutdown\n", disk->disk_name, atomic_read(&brick->open_count)); status = -EBUSY; goto done; // don't indicate "off" status } @@ -861,14 +861,14 @@ static int if_open(struct block_device *bdev, fmode_t mode) down(&brick->switch_sem); if (unlikely(!brick->power.led_on)) { - MARS_INF("----------------------- BUSY %d ------------------------------\n", atomic_read(&input->open_count)); + MARS_INF("----------------------- BUSY %d ------------------------------\n", atomic_read(&brick->open_count)); up(&brick->switch_sem); return -EBUSY; } - atomic_inc(&input->open_count); + atomic_inc(&brick->open_count); - MARS_INF("----------------------- OPEN %d ------------------------------\n", atomic_read(&input->open_count)); + MARS_INF("----------------------- OPEN %d ------------------------------\n", atomic_read(&brick->open_count)); up(&brick->switch_sem); return 0; @@ -877,14 +877,12 @@ static int if_open(struct block_device *bdev, fmode_t mode) static int if_release(struct gendisk *gd, fmode_t mode) { struct if_input *input = gd->private_data; + struct if_brick *brick = input->brick; int nr; - MARS_INF("----------------------- CLOSE %d ------------------------------\n", atomic_read(&input->open_count)); - - if (atomic_dec_and_test(&input->open_count)) { - struct if_brick *brick; - brick = input->brick; + MARS_INF("----------------------- CLOSE %d ------------------------------\n", atomic_read(&brick->open_count)); + if (atomic_dec_and_test(&brick->open_count)) { while ((nr = atomic_read(&input->flying_count)) > 0) { MARS_INF("%d IO requests not yet completed\n", nr); brick_msleep(1000); @@ -998,6 +996,7 @@ MARS_MAKE_STATICS(if); static int if_brick_construct(struct if_brick *brick) { sema_init(&brick->switch_sem, 1); + atomic_set(&brick->open_count, 0); return 0; } @@ -1022,7 +1021,6 @@ static int if_input_construct(struct if_input *input) INIT_LIST_HEAD(&input->plug_anchor); sema_init(&input->kick_sem, 1); spin_lock_init(&input->req_lock); - atomic_set(&input->open_count, 0); atomic_set(&input->flying_count, 0); atomic_set(&input->read_flying_count, 0); atomic_set(&input->write_flying_count, 0); diff --git a/kernel/mars_if.h b/kernel/mars_if.h index 07b151f7..66681a29 100644 --- a/kernel/mars_if.h +++ b/kernel/mars_if.h @@ -47,7 +47,6 @@ struct if_input { struct timer_list timer; #endif unsigned long capacity; - atomic_t open_count; atomic_t plugged_count; atomic_t flying_count; // only for statistics @@ -78,6 +77,7 @@ struct if_brick { int readahead; bool skip_sync; // inspectable + atomic_t open_count; // private struct semaphore switch_sem; struct say_channel *say_channel; diff --git a/kernel/sy_old/mars_light.c b/kernel/sy_old/mars_light.c index 065b4f40..2e9a1ea7 100644 --- a/kernel/sy_old/mars_light.c +++ b/kernel/sy_old/mars_light.c @@ -3124,7 +3124,7 @@ int make_dev(void *buf, struct mars_dent *dent) } switch_on = - (rot->if_brick && atomic_read(&rot->if_brick->inputs[0]->open_count) > 0) || + (rot->if_brick && atomic_read(&rot->if_brick->open_count) > 0) || (rot->todo_primary && !rot->trans_brick->replay_mode && rot->trans_brick->power.led_on); @@ -3157,6 +3157,7 @@ int make_dev(void *buf, struct mars_dent *dent) } dev_brick->show_status = _show_brick_status; _dev_brick = (void*)dev_brick; + __show_actual(rot->parent_path, "open-count", atomic_read(&_dev_brick->open_count)); #if 0 if (_dev_brick->has_closed) { _dev_brick->has_closed = false; diff --git a/userspace/marsadm b/userspace/marsadm index 7dd03f39..4fcae451 100755 --- a/userspace/marsadm +++ b/userspace/marsadm @@ -267,22 +267,12 @@ sub check_primary_gone { for (;;) { my $pri = _get_actual_primary($res); last if !$pri; + last if $pri eq $host; lprint "waiting for other primary host ($pri) to disappear....\n"; sleep_timeout(); } } -sub check_primary_settled { - my ($res) = @_; - my $target_primary = _get_designated_primary($res); - for (;;) { - my $actual_primary = _get_actual_primary($res) || '(none)'; - last if ($target_primary eq $actual_primary); - lprint "waiting for primary $target_primary to be settled\n"; - sleep_timeout(2); - } -} - sub check_todo { my ($cmd, $res, $key, $val, $wait, $unchecked, $inv) = @_; my $path = "$mars/resource-$res/todo-$host/$key"; @@ -405,74 +395,8 @@ sub get_minmax_replays { return _get_minmax($res, "$mars/resource-$res/replay-*", 1); } -sub check_splitbrain { - # check only the chain of $host (or all hosts if unset) - # check up to $sequence (or for all if < 0) - my ($res, $host, $sequence) = @_; - if ($sequence < 0) { - my $old = _get_actual_primary($res) || "(none)"; - _primary_res($res, "(none)", $old) unless $old eq "(none)"; - finish_links(); - sleep(5); - while (!_check_files_modified_any_of("$mars/resource-$res/{log,version,replay}-*", 60)) { - lprint "resource directory $res not stable, waiting....\n"; - sleep_timeout(); - } - while (1) { - my ($min_log, $max_log) = get_minmax_logfiles($res); - my ($min_ver, $max_ver) = get_minmax_versions($res); - my ($min_rep, $max_rep) = get_minmax_replays($res); - if ($min_ver > $min_log || $max_ver < $max_log) { - lprint "some version links are missing...\n"; - sleep_timeout(); - next; - } - if ($max_log >= $max_rep) { - lprint "resource $res: logfile $max_log is present.\n"; - last; - } - lprint "resource $res: logfile $max_log is not yet transferred (need $max_rep), waiting....\n"; - sleep_timeout(); - } - } # $sequence < 0 - - my $glob = "$mars/resource-$res/version-[0-9]*"; - my @links = glob($glob); - if (!@links) { - @links = glob("$mars/resource-$res/version-[0-9]*-*"); - ldie "no version information available\n" unless @links; - lprint "assuming that I am primary for the first time\n"; - return; - } - - @links = sort(@links); - foreach my $link (@links) { - my $nr = $link; - $nr =~ s:^.*[a-z]+-([0-9]+)(-[^/]*)?$:$1:; - $nr = int($nr); - - next if ($sequence >= 0 && $nr > $sequence); - - my $fromhost = $link; - $fromhost =~ s:^.*version-[0-9]*-(.*)$:$1:; - - my $version = get_link($link); - my $otherhost = $version; - $otherhost =~ s/^[^,]*?,(?:log-[0-9]*?-|)([^,]*?),.*$/$1/; - my $otherlink = sprintf("$mars/resource-$res/version-%09d-$otherhost", $nr); - my $otherversion = get_link($otherlink); - - # ignore foreign mismatches - if ($host) { - next if $fromhost ne $host; - } - - # by defintion, the originator of a logfile is always "right" - next if $otherhost eq $fromhost; - - # final check - ldie "splitbrain at sequence $nr detected\n" unless $version eq $otherversion; - } +sub try_to_avoid_splitbrain { + # NYI } sub get_size { @@ -1112,41 +1036,71 @@ sub _primary_res { lprint "designated primary changed from '$old' to '$new'\n"; } -sub primary_res { +# check whether primary/secondary switching is possible at all +sub primary_phase0 { my ($cmd, $res) = @_; - my $sec = ($cmd eq "secondary"); - # we _must_ take the designated primary here, because the actual primary is a _runtime_ condition + ldie "cannot switch primary: mars kernel module is not loaded\n" unless ($cmd eq "secondary" || -d "/proc/sys/mars"); + if ($cmd eq "primary" and !$force) { + check_sync_finished($res, $host); + check_todo($cmd, $res, "attach", 1, 0); + check_todo($cmd, $res, "connect", 1, 0); + check_todo($cmd, $res, "allow-replay", 1, 0); + #check_status($cmd, $res, "replay_rate", 0, 0, 1); + } + my $old = _get_designated_primary($res); + return if ($old eq $host and $cmd eq "primary"); + return if $old eq "(none)"; + my $device_in_use = get_link("$mars/resource-$res/actual-$old/open-count", 1); + if ($device_in_use) { + my $name = get_link("$mars/resource-$res/device-$old", 1) || "unknown"; + lwarn "device '/dev/mars/$name' for resource '$res' is $device_in_use times in use on primary host '$old'\n"; + ldie "first you must umount/close the device (on host '$old')\n" unless $force; + } + lprint "all preconditions OK for resource '$res'\n"; +} + +# when necessary, switch to secondary (intermediately) +sub primary_phase1 { + my ($cmd, $res) = @_; + return if ($force and $cmd eq "primary"); + my $old = _get_designated_primary($res); + return if ($old eq $host and $cmd eq "primary"); + my $new = "(none)"; + return if $old eq $new; + _primary_res($res, $new, $old); +} + +# when necessary, wait +sub primary_phase2 { + my ($cmd, $res) = @_; + return if $force; + return unless $cmd eq "primary"; + check_primary_gone($res); + try_to_avoid_splitbrain(@_); +} + +# when necessary, switch to primary +sub primary_phase3 { + my ($cmd, $res) = @_; + return unless $cmd eq "primary"; my $old = _get_designated_primary($res); my $new = $host; - if ($sec) { - if ($old eq '(none)') { - lprint "resource '$res' is already designated as secondary everywhere\n"; - return; - } - if (($old ne $host) && !$force && (_get_actual_primary($res) ne $host)) { - ldie "for safety reasons, switching to secondary is only allowed when I ($host) am designated primary or actually primary for resource '$res'\n"; - } - $new = "(none)"; - } elsif ($old eq $new) { - lprint "I am already designated primary on resource '$res'.\n"; - return; - } elsif ($force) { - lprint "FORCING myself ($host) to be the designated primary...\n"; - } elsif (! -d "/proc/sys/mars") { - ldie "cannot switch to primary: mars kernel module is not loaded\n"; - } else { # try to switch myself to primary - lprint "trying to switch $new to primary...\n"; - check_sync_finished($res, $new); - check_todo($cmd, $res, "connect", 1, 0); - _primary_res($res, "(none)", $old) unless $old eq "(none)"; - finish_links(); - check_primary_gone($res); - check_splitbrain($res, $new, -1); - } + return if $old eq $new; _primary_res($res, $new, $old); - finish_links(); - check_primary_settled($res); - lprint "resource '$res': designated primary successfully changed from $old to $new\n"; +} + +# wait for device to appear +sub primary_phase4 { + my ($cmd, $res) = @_; + return unless $cmd eq "primary"; + return if $force; + my $name = get_link("$mars/resource-$res/device-$host"); + my $dev = "/dev/mars/$name"; + while (! -e $dev) { + lprint "device '$dev' not yet present\n"; + sleep_timeout(1); + } + lprint "device '$dev' is present\n" if -b $dev; } sub invalidate_res { @@ -1375,8 +1329,17 @@ my %cmd_table = "syncer" => \&ignore_cmd, "up" => \&up_res, "down" => \&up_res, - "primary" => \&primary_res, - "secondary" => \&primary_res, + "primary" => [ + "check preconditions", \&primary_phase0, + "leave primary state", \&primary_phase1, + "wait when necessary", \&primary_phase2, + "switch to primary", \&primary_phase3, + "wait for device", \&primary_phase4, + ], + "secondary" => [ + "check preconditions", \&primary_phase0, + "leave primary state", \&primary_phase1, + ], "invalidate" => \&invalidate_res, "invalidate-remote" => \&forbidden_cmd, "resize" => \&resize_res,