fix network race on primary/secondary switch

The race could lead to unnecessary split brain.
Now we wait until everything in the resource directory has been stable
for at least 60 seconds. This may be somewhat too conservative
when there are k>2 participants on the same resource.
This commit is contained in:
Thomas Schoebel-Theuer 2012-01-24 15:12:55 +01:00 committed by Thomas Schoebel-Theuer
parent fd0309fee9
commit 81bfb9471a
2 changed files with 35 additions and 2 deletions

View File

@ -1772,7 +1772,7 @@ int make_log_step(void *buf, struct mars_dent *dent)
*/
prev_log = rot->next_log;
if (prev_log && prev_log->d_serial + 1 != dent->d_serial) {
MARS_ERR("transaction logs are not consecutive at '%s' (%d ~> %d)\n", dent->d_path, prev_log->d_serial, dent->d_serial);
MARS_WRN("transaction logs are not consecutive at '%s' (%d ~> %d)\n", dent->d_path, prev_log->d_serial, dent->d_serial);
status = -EINVAL;
goto done;
}

View File

@ -133,11 +133,44 @@ sub check_status {
}
}
# Tell whether the entry at $path was last modified more than $age
# seconds ago.
#
# Arguments:
#   $path - filesystem path to inspect
#   $age  - minimum age in seconds
#
# Returns 0 when no modification time can be obtained (e.g. the entry
# does not exist), otherwise the boolean result of the age comparison.
sub _check_mtime {
    my ($path, $age) = @_;
    # lstat() does not follow symlinks, so for a link it is the link's own
    # timestamp that gets examined, not the one of its target.
    my @info = lstat($path);
    my $mtime = $info[9];
    return 0 unless $mtime;
    my $threshold = time() - $age;
    return ($mtime < $threshold);
}
# Check every filesystem entry matching the glob pattern $path with
# _check_mtime($entry, $age).
#
# Arguments:
#   $path - glob pattern selecting the entries to inspect
#   $age  - age in seconds, passed through to _check_mtime()
#
# Returns 1 when all matching entries pass the check (also when the
# pattern matches nothing), and 0 as soon as at least one entry fails.
sub _check_all_mtimes {
    my ($path, $age) = @_;
    # Every match is visited; collect the ones that fail the age check.
    my @failed = grep { !_check_mtime($_, $age) } glob($path);
    return @failed ? 0 : 1;
}
sub check_splitbrain {
my ($res, $host) = @_;
for(;;) {
my $pri = "$mars/resource-$res/primary";
my $old = readlink($pri) or die "cannot determine current primary\n";
_primary_res($res, "(none)", $pri, $old) unless $old eq "(none)";
_trigger();
sleep(5);
last if _check_all_mtimes("$mars/resource-$res/[lvr]*", 60);
print "resource directory $res not stable, waiting....\n";
sleep(5);
}
my @links = glob("$mars/resource-$res/version-[0-9]*-$host");
if(!@links) {
my @links = glob("$mars/resource-$res/version-[0-9]*-*");
@links = glob("$mars/resource-$res/version-[0-9]*-*");
die "no version information available\n" unless @links;
print "assuming that I am primary for the first time\n";
return;