Merge branch 'mars0.1.y' into mars0.1a.y

This commit is contained in:
Thomas Schoebel-Theuer 2019-02-12 11:26:58 +01:00
commit 5ad1645de9
11 changed files with 147 additions and 59 deletions

View File

@ -342,6 +342,35 @@ Attention! This branch will go EOL around February 2019.
And even more stable, although the 0.1a releases were
called "beta" up to now.
mars0.1stable68
* Minor fix: detach could hang under very weird circumstances.
* Minor fix: in extremly rare cases and under further conditions,
detach could hang due to a race.
Workaround was possible by re-attaching.
* Minor improvement: /dev/mars/mydata now disappears only after
writeback has finished. Although the old behaviour was correct,
certain userspace tool could have erronously concluded that
the primary has finished working. The new bevaiour is
hopefully more like to user expectance.
* Minor improvement: propagate physical and logical sector
sizes from the underlying disk to /dev/mars/mydata.
This can affects mkfs and other tools for making better
decisions about their internal parameters.
* Minor safeguard: disallow manual --ignore-sync override
when the target primary is inconsistent, only relevant
for (non-existent) sysadmins who absolutely don't know what
they are doing when they are combining this with --force.
Systemadmins who really know what they are doing can use
fake-sync in front of it, and then they are explicitly stating
once again that they really want to force a defective system,
and that they really know the fact that it is defective.
* Minor improvement: propagate logical and physical
sector sizes from the underlying disk to the virtual
mars device.
* Minor improvement: additional warning when network connections
are interrupted (asymmetrically), such as by mis-configuration
of network interfaces / routing / firewall rules / etc.
mars0.1stable67
* Minor fix: don't unnecessarily alert sysadmins when no systemd
unit files are installed.

View File

@ -24,45 +24,34 @@
#ifndef BRICK_WAIT_H
#define BRICK_WAIT_H
/* compat to some elder kernels...
/* Try to abstract from changes of the upstream kernel
* by using a hopefully stable interface.
*/
#ifndef ___wait_cond_timeout
#define ___wait_cond_timeout(x) (x)
#define prepare_to_wait_event(a,b,c) (prepare_to_wait(a, b, c), 0)
#endif
/* Some code stolen from include/linux/wait.h
*/
#define brick_wait(wq, condition, timeout) \
#define brick_wait(wq, flag, condition, timeout) \
({ \
__label__ __out; \
wait_queue_t __wait; \
long __ret = timeout; /* explicit shadow */ \
unsigned long __tmout = (timeout); \
\
might_sleep(); \
/* check in advance to avoid spinlocks in fastpath */ \
if (condition) \
goto __out; \
\
INIT_LIST_HEAD(&__wait.task_list); \
__wait.flags = 0; \
\
for (;;) { \
long __int = prepare_to_wait_event(&wq, &__wait, TASK_INTERRUPTIBLE); \
\
if (__int) { \
__ret = __int; \
(flag) = false; \
smp_wmb(); \
while (!(condition)) { \
__tmout = wait_event_interruptible_timeout( \
wq, \
({ smp_rmb(); (flag); }), \
__tmout); \
if (__tmout <= 1) \
break; \
(flag) = false; \
smp_wmb(); \
} \
\
__ret = schedule_timeout(__ret); \
\
__set_current_state(TASK_RUNNING); \
if (___wait_cond_timeout(condition)) \
break; \
} \
finish_wait(&wq, &__wait); \
__out: __ret; \
__tmout; \
})
#define brick_wake(wq, flag) \
({ \
(flag) = true; \
smp_wmb(); \
wake_up_interruptible_all(wq); \
})

View File

@ -31,6 +31,7 @@
//#define IO_DEBUGGING
#include "lib_log.h"
#include "brick_wait.h"
atomic_t global_mref_flying = ATOMIC_INIT(0);
EXPORT_SYMBOL_GPL(global_mref_flying);
@ -133,8 +134,8 @@ void log_write_endio(struct generic_callback *cb)
put_log_cb_info(cb_info);
atomic_dec(&logst->mref_flying);
atomic_dec(&global_mref_flying);
if (logst->signal_event)
wake_up_interruptible(logst->signal_event);
if (logst->signal_event && logst->signal_flag)
brick_wake(logst->signal_event, *(logst->signal_flag));
return;

View File

@ -253,6 +253,7 @@ int log_scan(void *buf, int len, loff_t file_pos, int file_offset, bool sloppy,
struct log_status {
// interfacing
wait_queue_head_t *signal_event;
bool *signal_flag;
// tunables
loff_t start_pos;
loff_t end_pos;

View File

@ -284,6 +284,7 @@ static int bio_get_info(struct bio_output *output, struct mars_info *info)
{
struct bio_brick *brick = output->brick;
struct inode *inode;
struct request_queue *q;
int status = 0;
if (unlikely(!brick->mf ||
@ -296,6 +297,11 @@ static int bio_get_info(struct bio_output *output, struct mars_info *info)
info->tf_align = 512;
info->tf_min_size = 512;
q = bdev_get_queue(inode->i_bdev);
if (q) {
info->tf_align = queue_physical_block_size(q);
info->tf_min_size = queue_logical_block_size(q);
}
brick->total_size = i_size_read(inode);
info->current_size = brick->total_size;
MARS_DBG("determined device size = %lld\n", info->current_size);

View File

@ -913,17 +913,16 @@ loff_t if_get_capacity(struct if_brick *brick)
* device than physically.
*/
if (brick->dev_size <= 0) {
struct mars_info info = {};
struct if_input *input = brick->inputs[0];
int status;
status = GENERIC_INPUT_CALL(input, mars_get_info, &info);
status = GENERIC_INPUT_CALL(input, mars_get_info, &brick->info);
if (unlikely(status < 0)) {
MARS_ERR("cannot get device info, status=%d\n", status);
return 0;
}
MARS_INF("determined default capacity: %lld bytes\n", info.current_size);
brick->dev_size = info.current_size;
MARS_INF("determined default capacity: %lld bytes\n", brick->info.current_size);
brick->dev_size = brick->info.current_size;
}
return brick->dev_size;
}
@ -1032,8 +1031,17 @@ static int if_switch(struct if_brick *brick)
blk_queue_max_segment_size(q, USE_MAX_SEGMENT_SIZE);
#endif
#ifdef USE_LOGICAL_BLOCK_SIZE
if (brick->info.tf_align >= 512) {
MARS_DBG("blk_queue_physical_block_size(%d)\n", brick->info.tf_align);
blk_queue_physical_block_size(q, brick->info.tf_align);
}
if (brick->info.tf_min_size >= 512) {
MARS_DBG("blk_queue_logical_block_size(%d)\n", brick->info.tf_min_size);
blk_queue_logical_block_size(q, brick->info.tf_min_size);
} else {
MARS_DBG("blk_queue_logical_block_size()\n");
blk_queue_logical_block_size(q, USE_LOGICAL_BLOCK_SIZE);
}
#endif
#ifdef USE_SEGMENT_BOUNDARY
MARS_DBG("blk_queue_segment_boundary()\n");

View File

@ -110,6 +110,7 @@ struct if_brick {
// private
struct semaphore switch_sem;
struct say_channel *say_channel;
struct mars_info info;
};
MARS_TYPES(if);

View File

@ -737,6 +737,7 @@ int _write_ref_get(struct trans_logger_output *output, struct trans_logger_mref_
#ifdef DELAY_CALLERS
// delay in case of too many master shadows / memory shortage
brick_wait(brick->caller_event,
brick->caller_flag,
!brick->delay_callers &&
(brick_global_memlimit < 1024 || atomic64_read(&global_mshadow_used) / 1024 < brick_global_memlimit),
HZ / 2);
@ -979,7 +980,7 @@ void _trans_logger_endio(struct generic_callback *cb)
atomic_dec(&brick->any_fly_count);
atomic_inc(&brick->total_cb_count);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
err:
@ -1021,7 +1022,7 @@ void trans_logger_ref_io(struct trans_logger_output *output, struct mref_object
atomic_inc(&brick->inner_balance_count);
qq_mref_insert(&brick->q_phase[0], mref_a);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
}
@ -1181,7 +1182,7 @@ void wb_endio(struct generic_callback *cb)
}
done:
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
err:
@ -1576,7 +1577,7 @@ void phase0_endio(void *private, int error)
qq_deactivate(&brick->q_phase[0]);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
err:
MARS_ERR("giving up...\n");
@ -1784,7 +1785,7 @@ void phase1_endio(struct generic_callback *cb)
// queue up for the next phase
qq_wb_insert(&brick->q_phase[2], wb);
qq_deactivate(&brick->q_phase[1]);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
err:
@ -1852,7 +1853,7 @@ bool phase1_startio(struct trans_logger_mref_aspect *orig_mref_a)
#endif
qq_wb_insert(&brick->q_phase[3], wb);
qq_deactivate(&brick->q_phase[1]);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
}
done:
@ -1878,7 +1879,7 @@ void _phase2_endio(struct writeback_info *wb)
// queue up for the next phase
qq_wb_insert(&brick->q_phase[3], wb);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
}
@ -1992,7 +1993,7 @@ bool phase2_startio(struct writeback_info *wb)
ok = false;
}
}
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
} else {
_phase2_endio(wb);
}
@ -2035,7 +2036,7 @@ void phase3_endio(struct generic_callback *cb)
qq_deactivate(&brick->q_phase[3]);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
@ -2110,7 +2111,7 @@ int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_m
done:
if (found) {
mars_limit(&global_writeback.limiter, (total_len - 1) / 1024 + 1);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
}
return res;
}
@ -2144,7 +2145,7 @@ int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *
done:
if (found) {
mars_limit(&global_writeback.limiter, (total_len - 1) / 1024 + 1);
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
}
return res;
}
@ -2301,7 +2302,7 @@ int _do_ranking(struct trans_logger_brick *brick)
}
} else if (brick->delay_callers) {
brick->delay_callers = false;
wake_up_interruptible_all(&brick->caller_event);
brick_wake(&brick->caller_event, brick->caller_flag);
}
// global limit for flying mrefs
@ -2413,6 +2414,7 @@ void _init_input(struct trans_logger_input *input, loff_t start_pos, loff_t end_
init_logst(logst, (void*)input, start_pos, end_pos);
logst->signal_event = &brick->worker_event;
logst->signal_flag = &brick->worker_flag;
logst->align_size = CONF_TRANS_ALIGN;
logst->chunk_size = CONF_TRANS_CHUNKSIZE;
logst->max_size = CONF_TRANS_MAX_MREF_SIZE;
@ -2608,6 +2610,7 @@ void trans_logger_log(struct trans_logger_brick *brick)
brick_wait(
brick->worker_event,
brick->worker_flag,
({
winner = _do_ranking(brick);
MARS_IO("winner = %d\n", winner);
@ -2733,7 +2736,7 @@ void replay_endio(struct generic_callback *cb)
MARS_ERR("callback with empty replay_head (replay_count=%d)\n", atomic_read(&brick->replay_count));
}
wake_up_interruptible_all(&brick->worker_event);
brick_wake(&brick->worker_event, brick->worker_flag);
return;
err:
MARS_FAT("cannot handle replay IO\n");
@ -2773,6 +2776,7 @@ void wait_replay(struct trans_logger_brick *brick, struct trans_logger_mref_aspe
bool was_empty;
brick_wait(brick->worker_event,
brick->worker_flag,
atomic_read(&brick->replay_count) < max
&& (_has_conflict(brick, mref_a) ? conflicts++ : (ok = true), ok),
60 * HZ);
@ -3001,7 +3005,10 @@ void trans_logger_replay(struct trans_logger_brick *brick)
((long long)jiffies) - old_jiffies >= HZ * 3) &&
finished_pos >= 0) {
// for safety, wait until the IO queue has drained.
wait_event_interruptible_timeout(brick->worker_event, atomic_read(&brick->replay_count) <= 0, 30 * HZ);
brick_wait(brick->worker_event,
brick->worker_flag,
atomic_read(&brick->replay_count) <= 0,
30 * HZ);
if (unlikely(brick->disk_io_error)) {
@ -3023,7 +3030,10 @@ void trans_logger_replay(struct trans_logger_brick *brick)
MARS_INF("waiting for finish...\n");
wait_event_interruptible_timeout(brick->worker_event, atomic_read(&brick->replay_count) <= 0, 60 * HZ);
brick_wait(brick->worker_event,
brick->worker_flag,
atomic_read(&brick->replay_count) <= 0,
60 * HZ);
if (unlikely(finished_pos > brick->replay_end_pos)) {
MARS_ERR("finished_pos too large: %lld + %d = %lld > %lld\n", input->logst.log_pos, input->logst.offset, finished_pos, brick->replay_end_pos);

View File

@ -231,6 +231,8 @@ struct trans_logger_brick {
struct logger_queue q_phase[LOGGER_QUEUES];
struct rank_data rkd[LOGGER_QUEUES];
bool delay_callers;
bool caller_flag;
bool worker_flag;
};
struct trans_logger_output {

View File

@ -1576,6 +1576,13 @@ int __make_copy(
for (i = 0; i < 2; i++) {
struct mars_brick *aio;
/* do not change names underway */
if (copy && copy->inputs[i] && copy->inputs[i]->connect) {
aio = copy->inputs[i]->connect->brick;
if (aio && aio->power.button)
goto found;
}
cc.argv[i] = argv[i];
if (parent) {
cc.fullpath[i] = path_make("%s/%s", parent, argv[i]);
@ -1604,7 +1611,10 @@ int __make_copy(
make_msg(msg_pair, "cannot instantiate '%s'", cc.fullpath[i]);
goto done;
}
found:
cc.output[i] = aio->outputs[0];
/* When switching off, use a short timeout for aborting.
* Important on very slow networks (since a large number
* of requests may be pending).
@ -4392,8 +4402,10 @@ int make_dev(void *buf, struct mars_dent *dent)
switch_on =
(rot->if_brick && atomic_read(&rot->if_brick->open_count) > 0) ||
(rot->todo_primary &&
rot->trans_brick &&
!rot->trans_brick->replay_mode &&
rot->trans_brick->power.led_on &&
(rot->trans_brick->power.led_on ||
(!rot->trans_brick->power.button && !rot->trans_brick->power.led_off)) &&
_check_allow(global, dent->d_parent, "attach"));
if (!global->global_power.button) {
switch_on = false;

View File

@ -738,19 +738,24 @@ sub finish_links {
while (my $link = shift @link_list) {
my $link_tmp = to_tmp($link);
my $target = readlink($link_tmp);
next unless defined($target);
my $this_timestamp = $timestamp;
# allow overriding of secondaries in partitioned clusters by use of small timestamps
if ($target eq "(none)") {
my @stat = lstat($link);
$this_timestamp = $stat[9] + 1 if @stat;
}
system("touch -h -d \"\@$this_timestamp\" $link_tmp") == 0 or ldie "cannot set mtime on symlink '$link_tmp'\n";
unless (system("touch -h -d \"\@$this_timestamp\" $link_tmp") == 0) {
lwarn "cannot set mtime on symlink '$link_tmp'\n";
}
if ($dry_run) {
lprint "DRY_RUN: would create symlink '$link' -> '$target'\n";
unlink($link_tmp);
next;
}
rename($link_tmp, $link) or ldie "cannot finalize symlink '$link'\n";
unless (rename($link_tmp, $link)) {
lwarn "cannot finalize symlink '$link'\n";
}
if ($verbose) {
lprint "created symlink '$link' -> '$target'\n";
}
@ -781,6 +786,7 @@ sub get_alive_links {
my $res = shift || "all";
my $alive = shift || "alive";
my $hosts = shift || "*";
my $warn = shift || 0;
$res = "*" if $res eq "all";
my %cand;
foreach my $path (glob("$mars/ips/ip-$hosts")) {
@ -801,6 +807,15 @@ sub get_alive_links {
# peer must be participating in the same resources
my @other = glob("$mars/resource-$res/data-$peer");
next unless @other;
# I must be participating in some of the _same_ resources
my $common = 0;
foreach my $check (@other) {
my $self = `dirname $check`;
chomp $self;
$self .= "/data-$host";
$common++ if -e $self;
}
next unless $common;
# OK: remember peer
$peers{$peer} = 1;
}
@ -817,6 +832,18 @@ sub get_alive_links {
my $peer = $1;
$links{$peer} = get_link($path, 1);
}
if ($warn) {
foreach my $peer (keys(%peers)) {
my $path = "$mars/$alive-$peer";
if (!is_link_recent($path)) {
my $stamp = get_link_stamp($path);
my $age = seconds2human(mars_time() - $stamp);
my $msg = "no metadata is arriving from peer '$peer', age = $age";
$msg .= " => check your network setup" if is_module_loaded();
lwarn "$msg\n";
}
}
}
return %links;
}
@ -1083,6 +1110,7 @@ sub check_sync_finished {
if ($peer eq $host) {
lwarn "Don't try to make inconsistent host '$host' the new primary!\n";
lwarn "Please wait until sync has finished and all logfile have been replayed.\n";
ldie "Refusing to switch inconsistent host '$host' to primary\n";
} else {
lwarn "Changing the primary role during sync is dangerous for data consistency on host '$peer'!\n";
}
@ -6555,6 +6583,7 @@ if ($cmd =~ m/^(view|pretty)/) {
view_cmd($global_macro_name, "", @args) if $global_macro;
}
finish_links();
get_alive_links("all", "alive", "*", 1);
exit($error_count);
}