Merge branch 'mars0.1.y' into mars0.1a.y

This commit is contained in:
Thomas Schoebel-Theuer 2018-10-15 07:24:03 +02:00
commit 26ddf5e68c
7 changed files with 5881 additions and 28 deletions

View File

@ -296,6 +296,16 @@ Hint: branch 0.1a will get a merge from here, and then get the
(except Football related ones) will then go to 0.1b.
Finally, when 0.1a is stable, I will close this branch.
mars0.1stable63
* Minor fix: when compiling for some newer kernels (only there),
schedule() could be called during wait for some condition,
worsening performance unnecessarily.
* Minor improvement: starting join-resource in batches
was slow because each was waiting for cluster communication.
Use a manual "marsadm wait-cluster" before starting batches
of join-resource operations.
* Doc: some clarifications on BigCluster scalability behaviour.
mars0.1stable62
* Minor fix: race between join-resource and log-rotate.
* Minor fix: report split brain logfile amount only when

File diff suppressed because it is too large Load Diff

Binary file not shown.

69
kernel/brick_wait.h Normal file
View File

@ -0,0 +1,69 @@
/*
* MARS Long Distance Replication Software
*
* This file is part of MARS project: http://schoebel.github.io/mars/
*
* Copyright (C) 2010-2014 Thomas Schoebel-Theuer
* Copyright (C) 2011-2014 1&1 Internet AG
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef BRICK_WAIT_H
#define BRICK_WAIT_H
/* compat to some elder kernels...
*/
#ifndef ___wait_cond_timeout
#define ___wait_cond_timeout(x) (x)
#define prepare_to_wait_event(a,b,c) (prepare_to_wait(a, b, c), 0)
#endif
/* Some code stolen from include/linux/wait.h
*/
#define brick_wait(wq, condition, timeout) \
({ \
__label__ __out; \
wait_queue_t __wait; \
long __ret = timeout; /* explicit shadow */ \
\
might_sleep(); \
/* check in advance to avoid spinlocks in fastpath */ \
if (condition) \
goto __out; \
\
INIT_LIST_HEAD(&__wait.task_list); \
__wait.flags = 0; \
\
for (;;) { \
long __int = prepare_to_wait_event(&wq, &__wait, TASK_INTERRUPTIBLE); \
\
if (__int) { \
__ret = __int; \
break; \
} \
\
__ret = schedule_timeout(__ret); \
\
__set_current_state(TASK_RUNNING); \
if (___wait_cond_timeout(condition)) \
break; \
} \
finish_wait(&wq, &__wait); \
__out: __ret; \
})
#endif

View File

@ -245,27 +245,19 @@ void copy_endio(struct generic_callback *cb)
error = -EINVAL;
goto exit;
}
st->active[queue] = false;
if (unlikely(st->table[queue])) {
if (unlikely(st->table[queue] != mref)) {
MARS_ERR("table corruption at %d %d (%p => %p)\n", index, queue, st->table[queue], mref);
error = -EEXIST;
goto exit;
}
if (unlikely(cb->cb_error < 0)) {
error = cb->cb_error;
__clear_mref(brick, mref, queue);
/* This is racy, but does no harm.
* Worst case just produces more error output.
*/
if (!brick->copy_error_count++) {
MARS_WRN("IO error %d on index %d, old state = %d\n", cb->cb_error, index, st->state);
}
} else {
if (unlikely(st->table[queue])) {
MARS_ERR("overwriting index %d, state = %d\n", index, st->state);
_clear_mref(brick, index, queue);
}
st->table[queue] = mref;
}
exit:
@ -273,6 +265,7 @@ exit:
st->error = error;
_clash(brick);
}
st->active[queue] = false;
if (mref->ref_rw) {
atomic_dec(&brick->copy_write_flight);
atomic_dec(&global_copy_write_flight);
@ -294,6 +287,7 @@ int _make_mref(struct copy_brick *brick, int index, int queue, void *data, loff_
struct mref_object *mref;
struct copy_mref_aspect *mref_a;
struct copy_input *input;
struct copy_state *st;
int offset;
int len;
int status = -EAGAIN;
@ -353,7 +347,10 @@ int _make_mref(struct copy_brick *brick, int index, int queue, void *data, loff_
//MARS_IO("queue = %d index = %d pos = %lld len = %d rw = %d\n", queue, index, mref->ref_pos, mref->ref_len, rw);
GET_STATE(brick, index).active[queue] = true;
st = &GET_STATE(brick, index);
st->table[queue] = mref;
st->active[queue] = true;
if (rw) {
atomic_inc(&brick->copy_write_flight);
atomic_inc(&global_copy_write_flight);
@ -442,8 +439,6 @@ restart:
goto idle;
}
_clear_mref(brick, index, 1);
_clear_mref(brick, index, 0);
st->writeout = false;
st->error = 0;
@ -475,17 +470,16 @@ restart:
next_state = COPY_STATE_READ2;
/* fallthrough */
case COPY_STATE_READ2:
mref1 = st->table[1];
if (!mref1) { // idempotence: wait by unchanged state
if (st->active[1]) { // idempotence: wait by unchanged state
goto idle;
}
/* fallthrough => wait for both mrefs to appear */
case COPY_STATE_READ1:
case COPY_STATE_READ3:
mref0 = st->table[0];
if (!mref0) { // idempotence: wait by unchanged state
if (st->active[0]) { // idempotence: wait by unchanged state
goto idle;
}
mref0 = st->table[0];
if (brick->copy_limiter) {
int amount = (mref0->ref_len - 1) / 1024 + 1;
mars_limit_sleep(brick->copy_limiter, amount);
@ -565,6 +559,11 @@ restart:
progress = -EILSEQ;
break;
}
if (unlikely(st->active[0])) {
MARS_ERR("src buffer for write is active, state %d at index %d\n", state, index);
progress = -EILSEQ;
break;
}
if (unlikely(brick->is_aborting)) {
progress = -EINTR;
break;
@ -587,8 +586,7 @@ restart:
next_state = COPY_STATE_WRITTEN;
/* fallthrough */
case COPY_STATE_WRITTEN:
mref1 = st->table[1];
if (!mref1) { // idempotence: wait by unchanged state
if (st->active[1]) { // idempotence: wait by unchanged state
MARS_IO("irrelevant\n");
goto idle;
}

View File

@ -36,6 +36,7 @@
#include <linux/string.h>
#include <linux/bio.h>
#include "brick_wait.h"
#include "mars.h"
#include "lib_limiter.h"
@ -735,7 +736,7 @@ int _write_ref_get(struct trans_logger_output *output, struct trans_logger_mref_
#ifdef DELAY_CALLERS
// delay in case of too many master shadows / memory shortage
wait_event_interruptible_timeout(brick->caller_event,
brick_wait(brick->caller_event,
!brick->delay_callers &&
(brick_global_memlimit < 1024 || atomic64_read(&global_mshadow_used) / 1024 < brick_global_memlimit),
HZ / 2);
@ -2300,7 +2301,7 @@ int _do_ranking(struct trans_logger_brick *brick)
}
} else if (brick->delay_callers) {
brick->delay_callers = false;
wake_up_interruptible(&brick->caller_event);
wake_up_interruptible_all(&brick->caller_event);
}
// global limit for flying mrefs
@ -2605,7 +2606,7 @@ void trans_logger_log(struct trans_logger_brick *brick)
int winner;
int nr;
wait_event_interruptible_timeout(
brick_wait(
brick->worker_event,
({
winner = _do_ranking(brick);
@ -2771,7 +2772,7 @@ void wait_replay(struct trans_logger_brick *brick, struct trans_logger_mref_aspe
bool ok = false;
bool was_empty;
wait_event_interruptible_timeout(brick->worker_event,
brick_wait(brick->worker_event,
atomic_read(&brick->replay_count) < max
&& (_has_conflict(brick, mref_a) ? conflicts++ : (ok = true), ok),
60 * HZ);

View File

@ -1697,7 +1697,6 @@ sub try_to_avoid_splitbrain {
my $old_timeout = $timeout;
$timeout = $window if $timeout < 0;
$old_primary = "" if $old_primary eq "(none)";
wait_cluster($cmd, $res, $old_primary);
if (!detect_splitbrain($res, 0)) {
lwarn "ATTENTION: you are starting a non-forced primary switchover in a split brain situation.\n";
lwarn "ATTENTION: that's no good idea.\n";