all: IO scheduling improvements, tuning

Drastically boost random write performance on RAID controllers with BBUs.
Writeback is only performed when there is no IO contention / starvation.
The old IO contention controller was suited to workstations.
Now server loads are well-controlled even when BBUs are present.
Thomas Schoebel-Theuer 2012-10-15 16:35:36 +02:00 committed by Thomas Schoebel-Theuer
parent d510dd1a3b
commit c319230fa1
13 changed files with 596 additions and 338 deletions

View File

@ -11,7 +11,6 @@
atomic_t q_total; \
/* tunables */ \
int q_batchlen; \
int q_max_flying; \
int q_io_prio; \
bool q_ordering; \
/* private */ \

View File

@ -11,28 +11,27 @@
#include "mars.h"
#include "lib_rank.h"
void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int rki_count, int x)
void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int x)
{
int i;
MARS_IO("rki_count = %d at x = %d\n", rki_count, x);
BUG_ON(rki_count < 2);
rki_count--;
for (i = 0; i < rki_count; i++) {
for (i = 0; ; i++) {
int x0 = rki[i].rki_x;
int x1;
int y0;
int y1;
int points;
if (x0 == RKI_DUMMY)
break;
if (x < x0 && i+1 < rki_count)
if (x < x0)
continue;
x1 = rki[i+1].rki_x;
BUG_ON(x1 == x0);
if (x1 == RKI_DUMMY)
break;
y0 = rki[i].rki_y;
y1 = rki[i+1].rki_y;
@ -59,7 +58,7 @@ int ranking_select(struct rank_data rkd[], int rkd_count)
int rest = tmp->rkd_current_points;
if (rest <= 0)
continue;
rest -= tmp->rkd_got;
//rest -= tmp->rkd_got;
if (rest > max) {
max = rest;
res = i;
@ -69,9 +68,9 @@ int ranking_select(struct rank_data rkd[], int rkd_count)
* and reset the "clocks" after each round of
* weighted round-robin selection.
*/
if (max <= 0 && res >= 0) {
if (max < 0 && res >= 0) {
for (i = 0; i < rkd_count; i++)
rkd[i].rkd_got -= rkd[i].rkd_current_points;
rkd[i].rkd_got += max;
}
MARS_IO("res = %d\n", res);
return res;

View File

@ -7,6 +7,8 @@
/* Generic round-robin scheduler based on ranking information.
*/
#define RKI_DUMMY INT_MIN
struct rank_info {
int rki_x;
int rki_y;
@ -41,8 +43,10 @@ struct rank_data {
* Important: the rki[] array describes a ranking function at some
* example points (x_i,y_i) which must be ordered according to x_i
* in ascending order. And, of course, you need to supply at least
* rki_count >= 2 sample points (otherwise a linear function cannot
* two sample points (otherwise a linear function cannot
* be described).
* The array _must_ always end with a dummy record where the x_i has the
* value RKI_DUMMY.
*/
extern inline
@ -54,7 +58,7 @@ void ranking_start(struct rank_data rkd[], int rkd_count)
}
}
extern void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int rki_count, int x);
extern void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int x);
/* This may be used to (exceptionally) add some extra salt...
*/
@ -105,9 +109,11 @@ extern int ranking_select(struct rank_data rkd[], int rkd_count);
extern inline
void ranking_select_done(struct rank_data rkd[], int winner, int win_points)
{
if (win_points < 1)
win_points = 1;
rkd[winner].rkd_got += win_points;
if (winner >= 0) {
if (win_points < 1)
win_points = 1;
rkd[winner].rkd_got += win_points;
}
}
#endif
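
The header comment above fixes the lib_rank calling convention. As an editorial aid, here is a minimal hypothetical usage sketch; the table values, the two-queue setup and the names queued[]/work_done are invented for illustration and do not appear in this commit:

static const struct rank_info example_rank[] = {
	{     0,   0 },
	{     1, 100 },
	{ 10000, 100 },
	{ RKI_DUMMY }	/* mandatory terminator */
};

static void example_round(struct rank_data rkd[2], int queued[2])
{
	int winner;
	int i;

	ranking_start(rkd, 2);
	for (i = 0; i < 2; i++) {
		/* x is the current queue length; empty queues earn 0 points */
		ranking_compute(&rkd[i], example_rank, queued[i]);
	}
	ranking_stop(rkd, 2);

	winner = ranking_select(rkd, 2);
	if (winner >= 0) {
		int work_done = 1;	/* e.g. number of requests actually started */
		/* ... submit one batch from queue 'winner' here ... */
		ranking_select_done(rkd, winner, work_done);
	}
}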

8
mars.h
View File

@ -16,6 +16,7 @@
#include "brick.h"
#include "brick_mem.h"
#include "lib_timing.h"
#define GFP_MARS GFP_BRICK
@ -264,6 +265,13 @@ extern void (*_mars_remote_trigger)(void);
/////////////////////////////////////////////////////////////////////////
/* Some global stuff.
*/
extern struct banning mars_global_ban;
/////////////////////////////////////////////////////////////////////////
/* Some special brick types for avoidance of cyclic references.
*
* The client/server network bricks use this for independent instantiation

View File

@ -18,48 +18,47 @@
#include "mars.h"
#include "lib_timing.h"
#include "mars_aio.h"
#define MARS_MAX_AIO 1024
#define MARS_MAX_AIO_READ 32
#define MEASURE_SYNC 8
static struct timing_stats timings[2] = {};
static struct timing_stats timings[3] = {};
struct threshold aio_submit_threshold = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_SUBMIT_MAX_LATENCY,
.thr_factor = 10,
.thr_plus = 10000,
};
EXPORT_SYMBOL_GPL(aio_submit_threshold);
struct threshold aio_io_threshold[2] = {
[0] = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_IO_R_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
},
[1] = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_IO_W_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
},
};
EXPORT_SYMBOL_GPL(aio_io_threshold);
struct threshold aio_sync_threshold = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_SYNC_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
};
EXPORT_SYMBOL_GPL(aio_sync_threshold);
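
The new threshold objects couple measured latencies to the global banning mechanism. threshold_check() itself is not part of this diff (it lives in lib_timing), so the following is only a plausible reading of the tunables, not the confirmed implementation:

/* Assumed semantics, NOT confirmed by this diff: when a measured latency
 * exceeds thr_limit (microseconds), the threshold counts a hit and renews
 * the ban at thr_ban for a latency-proportional window, roughly
 *
 *     ban_us = latency * thr_factor / 100 + thr_plus;
 *
 * Example with aio_submit_threshold (limit 1000 us, factor 10, plus 10000):
 * a 2000 us submission would give 2000 * 10 / 100 + 10000 = 10200 us of
 * banning, during which the banning_is_hit(&mars_global_ban) check in the
 * trans_logger's _do_ranking() makes writeback back off.
 */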
///////////////////////// own type definitions ////////////////////////
#include "mars_aio.h"
#ifdef MEASURE_SYNC
static int sync_ticks[MEASURE_SYNC] = {};
static void measure_sync(int ticks)
{
int order = ticks;
if (ticks > 1) {
order = MEASURE_SYNC - 1;
while (order > 0 && (1 << (order-1)) >= ticks) {
order--;
}
order++;
}
sync_ticks[order]++;
}
static char *show_sync(void)
{
char *res = brick_string_alloc(0);
int i;
int pos = 0;
if (!res)
return NULL;
for (i = 0; i < MEASURE_SYNC; i++) {
pos += snprintf(res + pos, 256, "%d: %d ", i, sync_ticks[i]);
}
return res;
}
#endif
////////////////// some helpers //////////////////
static inline
@ -77,6 +76,8 @@ void _enqueue(struct aio_threadinfo *tinfo, struct aio_mref_aspect *mref_a, int
prio = 0;
#endif
mref_a->enqueue_stamp = cpu_clock(raw_smp_processor_id());
traced_lock(&tinfo->lock, flags);
if (at_end) {
@ -84,38 +85,45 @@ void _enqueue(struct aio_threadinfo *tinfo, struct aio_mref_aspect *mref_a, int
} else {
list_add(&mref_a->io_head, &tinfo->mref_list[prio]);
}
tinfo->queued[prio]++;
tinfo->queued_sum++;
traced_unlock(&tinfo->lock, flags);
atomic_inc(&tinfo->total_enqueue_count);
wake_up_interruptible_all(&tinfo->event);
}
static inline
struct aio_mref_aspect *_dequeue(struct aio_threadinfo *tinfo, bool do_remove)
struct aio_mref_aspect *_dequeue(struct aio_threadinfo *tinfo)
{
struct aio_mref_aspect *mref_a = NULL;
int prio;
unsigned long flags = 0;
if (do_remove)
traced_lock(&tinfo->lock, flags);
traced_lock(&tinfo->lock, flags);
for (prio = 0; prio < MARS_PRIO_NR; prio++) {
struct list_head *start = &tinfo->mref_list[prio];
struct list_head *tmp = start->next;
if (tmp != start) {
if (do_remove) {
list_del_init(tmp);
atomic_inc(&tinfo->total_dequeue_count);
}
list_del_init(tmp);
tinfo->queued[prio]--;
tinfo->queued_sum--;
mref_a = container_of(tmp, struct aio_mref_aspect, io_head);
goto done;
}
}
done:
if (do_remove)
traced_unlock(&tinfo->lock, flags);
traced_unlock(&tinfo->lock, flags);
if (likely(mref_a && mref_a->object)) {
unsigned long long latency;
latency = cpu_clock(raw_smp_processor_id()) - mref_a->enqueue_stamp;
threshold_check(&aio_io_threshold[mref_a->object->ref_rw & 1], latency);
}
return mref_a;
}
@ -272,8 +280,6 @@ static void aio_ref_io(struct aio_output *output, struct mref_object *mref)
}
_enqueue(tinfo, mref_a, mref->ref_prio, true);
wake_up_interruptible_all(&tinfo->event);
return;
done:
@ -295,16 +301,20 @@ static int aio_submit(struct aio_output *output, struct aio_mref_aspect *mref_a,
// .aio_reqprio = something(mref->ref_prio) field exists, but not yet implemented in kernelspace :(
};
struct iocb *iocbp = &iocb;
unsigned long long latency;
mars_trace(mref, "aio_submit");
oldfs = get_fs();
set_fs(get_ds());
TIME_STATS(&timings[mref->ref_rw & 1], res = sys_io_submit(output->ctxp, 1, &iocbp));
latency = TIME_STATS(&timings[mref->ref_rw & 1], res = sys_io_submit(output->ctxp, 1, &iocbp));
set_fs(oldfs);
threshold_check(&aio_submit_threshold, latency);
if (res < 0 && res != -EAGAIN)
MARS_ERR("error = %d\n", res);
return res;
}
@ -327,10 +337,12 @@ static int aio_submit_dummy(struct aio_output *output)
}
static
int aio_start_thread(struct aio_output *output, int i, int(*fn)(void*))
int aio_start_thread(
struct aio_output *output,
struct aio_threadinfo *tinfo,
int(*fn)(void*),
char class)
{
static int index = 0;
struct aio_threadinfo *tinfo = &output->tinfo[i];
int j;
for (j = 0; j < MARS_PRIO_NR; j++) {
@ -339,8 +351,9 @@ int aio_start_thread(struct aio_output *output, int i, int(*fn)(void*))
tinfo->output = output;
spin_lock_init(&tinfo->lock);
init_waitqueue_head(&tinfo->event);
init_waitqueue_head(&tinfo->terminate_event);
tinfo->terminated = false;
tinfo->thread = kthread_create(fn, tinfo, "mars_%daio%d", i, index++);
tinfo->thread = kthread_create(fn, tinfo, "mars_aio_%c%d", class, output->index);
if (IS_ERR(tinfo->thread)) {
int err = PTR_ERR(tinfo->thread);
MARS_ERR("cannot create thread\n");
@ -415,7 +428,7 @@ void aio_stop_thread(struct aio_output *output, int i, bool do_submit_dummy)
// wait for termination
MARS_INF("waiting for thread %d ...\n", i);
wait_event_interruptible_timeout(
tinfo->event,
tinfo->terminate_event,
tinfo->terminated,
(60 - i * 2) * HZ);
if (likely(tinfo->terminated)) {
@ -434,29 +447,28 @@ static
int aio_sync(struct file *file)
{
int err;
#ifdef MEASURE_SYNC
long long old_jiffies = jiffies;
#endif
err = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
#ifdef MEASURE_SYNC
measure_sync(jiffies - old_jiffies);
#endif
return err;
}
static
void aio_sync_all(struct aio_output *output, struct list_head *tmp_list)
{
unsigned long long latency;
int err;
output->fdsync_active = true;
atomic_inc(&output->total_fdsync_count);
err = aio_sync(output->filp);
latency = TIME_STATS(
&timings[2],
err = aio_sync(output->filp)
);
threshold_check(&aio_sync_threshold, latency);
output->fdsync_active = false;
wake_up_interruptible_all(&output->fdsync_event);
if (err < 0) {
@ -529,7 +541,7 @@ int aio_sync_thread(void *data)
MARS_INF("kthread has started on '%s'.\n", output->brick->brick_path);
//set_user_nice(current, -20);
while (!kthread_should_stop()) {
while (!kthread_should_stop() || tinfo->queued_sum > 0) {
LIST_HEAD(tmp_list);
unsigned long flags;
int i;
@ -539,9 +551,8 @@ int aio_sync_thread(void *data)
wait_event_interruptible_timeout(
tinfo->event,
kthread_should_stop() ||
_dequeue(tinfo, false),
1 * HZ);
tinfo->queued_sum > 0,
HZ / 4);
traced_lock(&tinfo->lock, flags);
for (i = 0; i < MARS_PRIO_NR; i++) {
@ -549,6 +560,8 @@ int aio_sync_thread(void *data)
if (!list_empty(start)) {
// move over the whole list
list_replace_init(start, &tmp_list);
tinfo->queued_sum -= tinfo->queued[i];
tinfo->queued[i] = 0;
break;
}
}
@ -568,7 +581,7 @@ int aio_sync_thread(void *data)
MARS_INF("kthread has stopped.\n");
tinfo->terminated = true;
wake_up_interruptible_all(&tinfo->event);
wake_up_interruptible_all(&tinfo->terminate_event);
return 0;
}
@ -586,17 +599,16 @@ static int aio_event_thread(void *data)
if (!current->mm)
goto err;
err = aio_start_thread(output, 2, aio_sync_thread);
err = aio_start_thread(output, &output->tinfo[2], aio_sync_thread, 'y');
if (unlikely(err < 0))
goto err;
while (!kthread_should_stop()) {
while (!kthread_should_stop() || tinfo->queued_sum > 0) {
mm_segment_t oldfs;
int count;
int bounced;
int i;
struct timespec timeout = {
.tv_sec = 10,
.tv_sec = 1,
};
struct io_event events[MARS_MAX_AIO_READ];
@ -609,7 +621,6 @@ static int aio_event_thread(void *data)
set_fs(oldfs);
//MARS_INF("count = %d\n", count);
bounced = 0;
for (i = 0; i < count; i++) {
struct aio_mref_aspect *mref_a = (void*)events[i].data;
struct mref_object *mref;
@ -631,7 +642,6 @@ static int aio_event_thread(void *data)
if (!output->filp->f_op->aio_fsync) {
mars_trace(mref, "aio_fsync");
_enqueue(other, mref_a, mref->ref_prio, true);
bounced++;
continue;
}
err = aio_submit(output, mref_a, true);
@ -642,8 +652,6 @@ static int aio_event_thread(void *data)
_complete(output, mref, err);
}
if (bounced)
wake_up_interruptible_all(&other->event);
}
err = 0;
@ -655,7 +663,7 @@ static int aio_event_thread(void *data)
unuse_fake_mm();
tinfo->terminated = true;
wake_up_interruptible_all(&tinfo->event);
wake_up_interruptible_all(&tinfo->terminate_event);
return err;
}
@ -759,11 +767,11 @@ static int aio_submit_thread(void *data)
if (unlikely(err < 0))
goto cleanup_mm;
err = aio_start_thread(output, 1, aio_event_thread);
err = aio_start_thread(output, &output->tinfo[1], aio_event_thread, 'e');
if (unlikely(err < 0))
goto cleanup_ctxp;
while (!kthread_should_stop() || atomic_read(&output->read_count) > 0 || atomic_read(&output->write_count) > 0) {
while (!kthread_should_stop() || atomic_read(&output->read_count) + atomic_read(&output->write_count) + tinfo->queued_sum > 0) {
struct aio_mref_aspect *mref_a;
struct mref_object *mref;
int sleeptime;
@ -773,11 +781,10 @@ static int aio_submit_thread(void *data)
wait_event_interruptible_timeout(
tinfo->event,
kthread_should_stop() ||
_dequeue(tinfo, false),
HZ);
tinfo->queued_sum > 0,
HZ / 4);
mref_a = _dequeue(tinfo, true);
mref_a = _dequeue(tinfo);
if (!mref_a) {
continue;
}
@ -803,7 +810,7 @@ static int aio_submit_thread(void *data)
mref_a->start_jiffies = jiffies;
}
if ((long long)jiffies - mref_a->start_jiffies <= mref->ref_timeout) {
if (!_dequeue(tinfo, false)) {
if (!tinfo->queued_sum) {
atomic_inc(&output->total_msleep_count);
brick_msleep(1000 * 4 / HZ);
}
@ -816,9 +823,9 @@ static int aio_submit_thread(void *data)
}
}
sleeptime = 1000 / HZ;
sleeptime = 1;
for (;;) {
/* This is just a test. Don't use it for performance reasons.
/* This is just a test. Don't enable it for performance reasons.
*/
if (output->brick->wait_during_fdsync && mref->ref_rw != READ) {
if (output->fdsync_active) {
@ -826,7 +833,7 @@ static int aio_submit_thread(void *data)
atomic_inc(&output->total_fdsync_wait_count);
__wait_event_interruptible_timeout(
output->fdsync_event,
!output->fdsync_active || kthread_should_stop(),
!output->fdsync_active,
delay);
}
@ -842,7 +849,7 @@ static int aio_submit_thread(void *data)
atomic_inc(&output->total_delay_count);
brick_msleep(sleeptime);
if (sleeptime < 100) {
sleeptime += 1000 / HZ;
sleeptime++;
}
}
err:
@ -876,7 +883,7 @@ cleanup_fd:
done:
MARS_DBG("status = %d\n", err);
tinfo->terminated = true;
wake_up_interruptible_all(&tinfo->event);
wake_up_interruptible_all(&tinfo->terminate_event);
return err;
}
@ -904,12 +911,9 @@ char *aio_statistics(struct aio_brick *brick, int verbose)
if (!res)
return NULL;
#ifdef MEASURE_SYNC
sync = show_sync();
#endif
pos += report_timing(&timings[0], res + pos, 4096 - pos);
pos += report_timing(&timings[1], res + pos, 4096 - pos);
pos += report_timing(&timings[2], res + pos, 4096 - pos);
snprintf(res + pos, 4096 - pos,
"total "
@ -927,10 +931,14 @@ char *aio_statistics(struct aio_brick *brick, int verbose)
"flying reads = %d "
"writes = %d "
"allocs = %d "
"q0 = %d (%d - %d) "
"q1 = %d (%d - %d) "
"q2 = %d (%d - %d) |"
" %s\n",
"q0 = %d "
"q1 = %d "
"q2 = %d "
"| total "
"q0 = %d "
"q1 = %d "
"q2 = %d "
"%s\n",
atomic_read(&output->total_read_count),
atomic_read(&output->total_write_count),
atomic_read(&output->total_alloc_count),
@ -945,12 +953,12 @@ char *aio_statistics(struct aio_brick *brick, int verbose)
atomic_read(&output->read_count),
atomic_read(&output->write_count),
atomic_read(&output->alloc_count),
atomic_read(&output->tinfo[0].total_enqueue_count) - atomic_read(&output->tinfo[0].total_dequeue_count),
atomic_read(&output->tinfo[0].total_enqueue_count), atomic_read(&output->tinfo[0].total_dequeue_count),
atomic_read(&output->tinfo[1].total_enqueue_count) - atomic_read(&output->tinfo[1].total_dequeue_count),
atomic_read(&output->tinfo[1].total_enqueue_count), atomic_read(&output->tinfo[1].total_dequeue_count),
atomic_read(&output->tinfo[2].total_enqueue_count) - atomic_read(&output->tinfo[2].total_dequeue_count),
atomic_read(&output->tinfo[2].total_enqueue_count), atomic_read(&output->tinfo[2].total_dequeue_count),
output->tinfo[0].queued_sum,
output->tinfo[1].queued_sum,
output->tinfo[2].queued_sum,
atomic_read(&output->tinfo[0].total_enqueue_count),
atomic_read(&output->tinfo[1].total_enqueue_count),
atomic_read(&output->tinfo[2].total_enqueue_count),
sync ? sync : "");
if (sync)
@ -975,7 +983,6 @@ void aio_reset_statistics(struct aio_brick *brick)
for (i = 0; i < 3; i++) {
struct aio_threadinfo *tinfo = &output->tinfo[i];
atomic_set(&tinfo->total_enqueue_count, 0);
atomic_set(&tinfo->total_dequeue_count, 0);
}
}
@ -1006,6 +1013,7 @@ static int aio_brick_construct(struct aio_brick *brick)
static int aio_switch(struct aio_brick *brick)
{
static int index;
struct aio_output *output = brick->outputs[0];
const char *path = output->brick->brick_path;
int flags = O_CREAT | O_RDWR | O_LARGEFILE;
@ -1052,7 +1060,8 @@ static int aio_switch(struct aio_brick *brick)
}
#endif
err = aio_start_thread(output, 0, aio_submit_thread);
output->index = ++index;
err = aio_start_thread(output, &output->tinfo[0], aio_submit_thread, 's');
if (err < 0)
goto err;

View File

@ -5,6 +5,15 @@
#include <linux/aio.h>
#include <linux/syscalls.h>
#define AIO_SUBMIT_MAX_LATENCY 1000 // 1 ms
#define AIO_IO_R_MAX_LATENCY 50000 // 50 ms
#define AIO_IO_W_MAX_LATENCY 150000 // 150 ms
#define AIO_SYNC_MAX_LATENCY 150000 // 150 ms
extern struct threshold aio_submit_threshold;
extern struct threshold aio_io_threshold[2];
extern struct threshold aio_sync_threshold;
//#define USE_CLEVER_SYNC // TODO: NYI (should result in better write performance)
#ifdef USE_CLEVER_SYNC
@ -24,6 +33,7 @@ struct aio_mref_aspect {
struct pairing_heap_sync heap_head;
#endif
struct list_head io_head;
unsigned long long enqueue_stamp;
long long start_jiffies;
int resubmit;
int alloc_len;
@ -50,10 +60,12 @@ struct aio_threadinfo {
struct aio_output *output;
struct task_struct *thread;
wait_queue_head_t event;
wait_queue_head_t terminate_event;
spinlock_t lock;
bool terminated;
int queued[MARS_PRIO_NR];
int queued_sum;
atomic_t total_enqueue_count;
atomic_t total_dequeue_count;
bool terminated;
};
struct aio_output {
@ -68,6 +80,7 @@ struct aio_output {
wait_queue_head_t fdsync_event;
bool fdsync_active;
// statistics
int index;
atomic_t total_read_count;
atomic_t total_write_count;
atomic_t total_alloc_count;

View File

@ -19,11 +19,35 @@
#include "mars.h"
#include "lib_timing.h"
#include "mars_bio.h"
static struct timing_stats timings[2] = {};
///////////////////////// own type definitions ////////////////////////
struct threshold bio_submit_threshold = {
.thr_ban = &mars_global_ban,
.thr_limit = BIO_SUBMIT_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
};
EXPORT_SYMBOL_GPL(bio_submit_threshold);
#include "mars_bio.h"
struct threshold bio_io_threshold[2] = {
[0] = {
.thr_ban = &mars_global_ban,
.thr_limit = BIO_IO_R_MAX_LATENCY,
.thr_factor = 10,
.thr_plus = 10000,
},
[1] = {
.thr_ban = &mars_global_ban,
.thr_limit = BIO_IO_W_MAX_LATENCY,
.thr_factor = 10,
.thr_plus = 10000,
},
};
EXPORT_SYMBOL_GPL(bio_io_threshold);
///////////////////////// own type definitions ////////////////////////
static void bio_ref_put(struct bio_output *output, struct mref_object *mref);
@ -46,10 +70,9 @@ void bio_callback(struct bio *bio, int code)
mref_a->status_code = code;
spin_lock_irqsave(&brick->lock, flags);
if (list_empty(&mref_a->io_head)) {
list_add_tail(&mref_a->io_head, &brick->completed_list);
atomic_inc(&brick->completed_count);
}
list_del(&mref_a->io_head);
list_add_tail(&mref_a->io_head, &brick->completed_list);
atomic_inc(&brick->completed_count);
spin_unlock_irqrestore(&brick->lock, flags);
wake_up_interruptible(&brick->response_event);
@ -303,6 +326,8 @@ void _bio_ref_io(struct bio_output *output, struct mref_object *mref, bool cork)
struct bio_brick *brick = output->brick;
struct bio_mref_aspect *mref_a = bio_mref_get_aspect(output->brick, mref);
struct bio *bio;
unsigned long long latency;
unsigned long flags;
int rw;
int status = -EINVAL;
@ -338,13 +363,23 @@ void _bio_ref_io(struct bio_output *output, struct mref_object *mref, bool cork)
MARS_IO("starting IO rw = %d prio 0 %d fly = %d\n", rw, mref->ref_prio, atomic_read(&brick->fly_count[PRIO_INDEX(mref)]));
mars_trace(mref, "bio_submit");
mref_a->start_stamp = cpu_clock(raw_smp_processor_id());
spin_lock_irqsave(&brick->lock, flags);
list_add_tail(&mref_a->io_head, &brick->submitted_list[rw & 1]);
spin_unlock_irqrestore(&brick->lock, flags);
#ifdef FAKE_IO
bio->bi_end_io(bio, 0);
#else
bio->bi_rw = rw;
TIME_STATS(&timings[rw & 1], submit_bio(rw, bio));
latency = TIME_STATS(
&timings[rw & 1],
submit_bio(rw, bio)
);
#endif
threshold_check(&bio_submit_threshold, latency);
status = 0;
if (unlikely(bio_flagged(bio, BIO_EOPNOTSUPP)))
status = -EOPNOTSUPP;
@ -401,16 +436,30 @@ int bio_response_thread(void *data)
for (;;) {
LIST_HEAD(tmp_list);
unsigned long flags;
int thr_limit;
int sleeptime;
int count;
int i;
thr_limit = bio_io_threshold[0].thr_limit;
if (bio_io_threshold[1].thr_limit < thr_limit)
thr_limit = bio_io_threshold[1].thr_limit;
sleeptime = HZ / 10;
if (thr_limit > 0) {
sleeptime = thr_limit / (1000000 * 2 / HZ);
if (unlikely(sleeptime < 2))
sleeptime = 2;
}
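
A worked example of the sleep-time formula, assuming HZ = 250 (one jiffy = 4000 us):

/* thr_limit = min(BIO_IO_R_MAX_LATENCY, BIO_IO_W_MAX_LATENCY) = 40000 us
 * sleeptime = 40000 / (1000000 * 2 / 250) = 40000 / 8000 = 5 jiffies = 20 ms
 * i.e. the response thread polls at half of the smaller IO latency
 * threshold, clamped to a minimum of 2 jiffies. */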
#ifdef IO_DEBUGGING
round++;
MARS_IO("%d sleeping...\n", round);
MARS_IO("%d sleeping %d...\n", round, sleeptime);
#endif
wait_event_interruptible_timeout(
brick->response_event,
atomic_read(&brick->completed_count) > 0,
HZ);
sleeptime);
MARS_IO("%d woken up, completed_count = %d fly_count[0] = %d fly_count[1] = %d fly_count[2] = %d\n",
round,
@ -428,6 +477,7 @@ int bio_response_thread(void *data)
struct list_head *tmp;
struct bio_mref_aspect *mref_a;
struct mref_object *mref;
unsigned long long latency;
int code;
if (list_empty(&tmp_list)) {
@ -439,16 +489,20 @@ int bio_response_thread(void *data)
tmp = tmp_list.next;
list_del_init(tmp);
atomic_dec(&brick->completed_count);
mref_a = container_of(tmp, struct bio_mref_aspect, io_head);
mref = mref_a->object;
latency = cpu_clock(raw_smp_processor_id()) - mref_a->start_stamp;
threshold_check(&bio_io_threshold[mref->ref_rw & 1], latency);
code = mref_a->status_code;
#ifdef IO_DEBUGGING
round++;
MARS_IO("%d completed , status = %d\n", round, code);
#endif
mref = mref_a->object;
mars_trace(mref, "bio_endio");
if (code < 0) {
@ -473,6 +527,26 @@ int bio_response_thread(void *data)
}
bio_ref_put(mref_a->output, mref);
}
/* Try to detect slow requests as early as possible,
* even before they have completed.
*/
for (i = 0; i < 2; i++) {
unsigned long long eldest = 0;
spin_lock_irqsave(&brick->lock, flags);
if (!list_empty(&brick->submitted_list[i])) {
struct bio_mref_aspect *mref_a;
mref_a = container_of(brick->submitted_list[i].next, struct bio_mref_aspect, io_head);
eldest = mref_a->start_stamp;
}
spin_unlock_irqrestore(&brick->lock, flags);
if (eldest) {
threshold_check(&bio_io_threshold[i], cpu_clock(raw_smp_processor_id()) - eldest);
}
}
if (count) {
brick->submitted = true;
wake_up_interruptible(&brick->submit_event);
@ -510,7 +584,7 @@ int bio_submit_thread(void *data)
wait_event_interruptible_timeout(
brick->submit_event,
brick->submitted,
HZ);
HZ / 2);
brick->submitted = false;
@ -719,6 +793,8 @@ static int bio_brick_construct(struct bio_brick *brick)
INIT_LIST_HEAD(&brick->queue_list[0]);
INIT_LIST_HEAD(&brick->queue_list[1]);
INIT_LIST_HEAD(&brick->queue_list[2]);
INIT_LIST_HEAD(&brick->submitted_list[0]);
INIT_LIST_HEAD(&brick->submitted_list[1]);
INIT_LIST_HEAD(&brick->completed_list);
init_waitqueue_head(&brick->submit_event);
init_waitqueue_head(&brick->response_event);

View File

@ -2,6 +2,13 @@
#ifndef MARS_BIO_H
#define MARS_BIO_H
#define BIO_SUBMIT_MAX_LATENCY 250 // 250 us
#define BIO_IO_R_MAX_LATENCY 40000 // 40 ms
#define BIO_IO_W_MAX_LATENCY 100000 // 100 ms
extern struct threshold bio_submit_threshold;
extern struct threshold bio_io_threshold[2];
#include <linux/blkdev.h>
struct bio_mref_aspect {
@ -9,6 +16,7 @@ struct bio_mref_aspect {
struct list_head io_head;
struct bio *bio;
struct bio_output *output;
unsigned long long start_stamp;
int status_code;
int hash_pos;
int alloc_len;
@ -33,6 +41,7 @@ struct bio_brick {
// private
spinlock_t lock;
struct list_head queue_list[MARS_PRIO_NR];
struct list_head submitted_list[2];
struct list_head completed_list;
wait_queue_head_t submit_event;
wait_queue_head_t response_event;

View File

@ -18,6 +18,9 @@
// infrastructure
struct banning mars_global_ban = {};
EXPORT_SYMBOL_GPL(mars_global_ban);
static char *id = NULL;
/* TODO: better use MAC addresses (or motherboard IDs where available).

View File

@ -11,12 +11,12 @@
// variants
#define KEEP_UNIQUE
//#define LATER
#define DELAY_CALLERS // this is _needed_ for production systems
//#define WB_COPY // unnecessary (only costs performance)
//#define LATE_COMPLETE // unnecessary (only costs performance)
//#define EARLY_COMPLETION
//#define OLD_POSCOMPLETE
#define SHORTCUT_1_to_3 // when possible, queue 1 executes phase3_startio() directly without intermediate queueing into queue 3 => may be irritating, but has better performance. NOTICE: when some day the IO scheduling should be different between queue 1 and 3, you MUST disable this in order to distinguish between them!
// commenting this out is dangerous for data integrity! use only for testing!
#define USE_MEMCPY
@ -33,23 +33,79 @@
#include "lib_rank.h"
#include "lib_limiter.h"
#include "mars_trans_logger.h"
#ifdef REPLAY_DEBUGGING
#define MARS_RPL(_fmt, _args...) _MARS_MSG(false, "REPLAY ", _fmt, ##_args)
#else
#define MARS_RPL(_args...) /*empty*/
#endif
#if 0
#define inline noinline
#endif
///////////////////////// global tuning ////////////////////////
int trans_logger_mem_usage; // in KB
EXPORT_SYMBOL_GPL(trans_logger_mem_usage);
struct writeback_group global_writeback = {
.lock = __RW_LOCK_UNLOCKED(global_writeback.lock),
.group_anchor = LIST_HEAD_INIT(global_writeback.group_anchor),
.until_percent = 30,
};
EXPORT_SYMBOL_GPL(global_writeback);
static
void add_to_group(struct writeback_group *gr, struct trans_logger_brick *brick)
{
write_lock(&gr->lock);
list_add_tail(&brick->group_head, &gr->group_anchor);
write_unlock(&gr->lock);
}
static
void remove_from_group(struct writeback_group *gr, struct trans_logger_brick *brick)
{
write_lock(&gr->lock);
list_del_init(&brick->group_head);
gr->leader = NULL;
write_unlock(&gr->lock);
}
static
struct trans_logger_brick *elect_leader(struct writeback_group *gr)
{
struct trans_logger_brick *res = gr->leader;
struct list_head *tmp;
if (res && gr->until_percent >= 0) {
loff_t used = atomic64_read(&res->shadow_mem_used);
if (used > gr->biggest * gr->until_percent / 100)
goto done;
}
read_lock(&gr->lock);
for (tmp = gr->group_anchor.next; tmp != &gr->group_anchor; tmp = tmp->next) {
struct trans_logger_brick *test = container_of(tmp, struct trans_logger_brick, group_head);
loff_t new_used = atomic64_read(&test->shadow_mem_used);
if (!res || new_used > atomic64_read(&res->shadow_mem_used)) {
res = test;
gr->biggest = new_used;
}
}
read_unlock(&gr->lock);
gr->leader = res;
done:
return res;
}
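
A numeric illustration of the election rule (figures invented; units are whatever shadow_mem_used counts):

/* Suppose three bricks hold 900, 400 and 100 units of shadow memory.
 * The scan elects the 900-unit brick and records biggest = 900.
 * With the default until_percent = 30, elect_leader() keeps returning
 * that brick while its usage stays above 900 * 30 / 100 = 270 units;
 * only after it has written back below that mark is a new leader
 * elected, so writeback concentrates on one brick at a time instead of
 * thrashing between all of them. */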
///////////////////////// own type definitions ////////////////////////
#include "mars_trans_logger.h"
#if 1
#define inline noinline
#endif
static inline
int lh_cmp(loff_t *a, loff_t *b)
{
@ -1587,7 +1643,7 @@ bool phase1_startio(struct trans_logger_mref_aspect *orig_mref_a)
qq_inc_flying(&brick->q_phase[1]);
fire_writeback(&wb->w_sub_read_list, false);
} else { // shortcut
#ifdef LATER
#ifndef SHORTCUT_1_to_3
qq_wb_insert(&brick->q_phase[3], wb);
wake_up_interruptible_all(&brick->worker_event);
#else
@ -1817,9 +1873,10 @@ bool phase3_startio(struct writeback_info *wb)
*/
static noinline
int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_mref_aspect *sub_mref_a), int max)
int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_mref_aspect *sub_mref_a), int max, bool do_limit)
{
struct trans_logger_brick *brick = q->q_brick;
int total_len = 0;
bool found = false;
bool ok;
int res = 0;
@ -1830,6 +1887,9 @@ int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_m
if (!mref_a)
goto done;
if (do_limit && likely(mref_a->object))
total_len += mref_a->object->ref_len;
ok = startio(mref_a);
if (unlikely(!ok)) {
qq_mref_pushback(q, mref_a);
@ -1842,6 +1902,7 @@ int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_m
done:
if (found) {
mars_limit(&global_writeback.limiter, (total_len - 1) / 1024 + 1);
wake_up_interruptible_all(&brick->worker_event);
}
return res;
@ -1851,6 +1912,7 @@ static noinline
int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *wb), int max)
{
struct trans_logger_brick *brick = q->q_brick;
int total_len = 0;
bool found = false;
bool ok;
int res = 0;
@ -1861,6 +1923,8 @@ int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *
if (!wb)
goto done;
total_len += wb->w_len;
ok = startio(wb);
if (unlikely(!ok)) {
qq_wb_pushback(q, wb);
@ -1872,6 +1936,7 @@ int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *
done:
if (found) {
mars_limit(&global_writeback.limiter, (total_len - 1) / 1024 + 1);
wake_up_interruptible_all(&brick->worker_event);
}
return res;
@ -1890,58 +1955,166 @@ int _congested(struct trans_logger_brick *brick)
|| atomic_read(&brick->q_phase[3].q_flying);
}
static const
struct rank_info rank0[] = {
{ 0, 100 },
{ 100, 200 },
};
static const
struct rank_info rank1[] = {
{ 0, 10 },
{ 100, 20 },
};
static const
struct rank_info rank2[] = {
{ 0, 10 },
{ 100, 20 },
};
static const
struct rank_info rank3[] = {
{ 0, 10 },
{ 100, 20 },
};
/* In general, each individual ranking table may have a different length.
/* Ranking tables.
*/
static const
struct rank_info *ranks[LOGGER_QUEUES] = {
[0] = rank0,
[1] = rank1,
[2] = rank2,
[3] = rank3,
static
struct rank_info float_queue_rank_log[] = {
{ 0, 0 },
{ 1, 100 },
{ 10000, 100 },
{ RKI_DUMMY }
};
static const
int rank_counts[LOGGER_QUEUES] = {
[0] = sizeof(rank0) / sizeof(struct rank_info),
[1] = sizeof(rank1) / sizeof(struct rank_info),
[2] = sizeof(rank2) / sizeof(struct rank_info),
[3] = sizeof(rank3) / sizeof(struct rank_info),
static
struct rank_info float_queue_rank_io[] = {
{ 0, 0 },
{ 1, 1 },
{ 10000, 1 },
{ RKI_DUMMY }
};
static
struct rank_info float_fly_rank_log[] = {
{ 0, 0 },
{ 32, 10 },
{ 10000, 10 },
{ RKI_DUMMY }
};
static
struct rank_info float_fly_rank_io[] = {
{ 0, 0 },
{ 1, 10 },
{ 2, -20 },
{ 10000, -100 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_queue_rank_log[] = {
{ 0, 0 },
{ 1, 100 },
{ 100, 10 },
{ 10000, 10 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_queue_rank_io[] = {
{ 0, 0 },
{ 1, 10 },
{ 100, 100 },
{ 10000, 200 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_fly_rank_log[] = {
{ 0, 0 },
{ 1, 1 },
{ 32, 10 },
{ 10000, 1 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_fly_rank_io[] = {
{ 0, 0 },
{ 1, 10 },
{ 128, 8 },
{ 129, -100 },
{ 10000, -200 },
{ RKI_DUMMY }
};
static
struct rank_info *queue_ranks[2][LOGGER_QUEUES] = {
[0] = {
[0] = float_queue_rank_log,
[1] = float_queue_rank_io,
[2] = float_queue_rank_io,
[3] = float_queue_rank_io,
},
[1] = {
[0] = nofloat_queue_rank_log,
[1] = nofloat_queue_rank_io,
[2] = nofloat_queue_rank_io,
[3] = nofloat_queue_rank_io,
},
};
static
struct rank_info *fly_ranks[2][LOGGER_QUEUES] = {
[0] = {
[0] = float_fly_rank_log,
[1] = float_fly_rank_io,
[2] = float_fly_rank_io,
[3] = float_fly_rank_io,
},
[1] = {
[0] = nofloat_fly_rank_log,
[1] = nofloat_fly_rank_io,
[2] = nofloat_fly_rank_io,
[3] = nofloat_fly_rank_io,
},
};
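
ranking_compute() evaluates these tables as piecewise-linear functions between neighbouring sample points; for example, with nofloat_fly_rank_io:

/* at flying = 64 the segment (1,10)..(128,8) applies:
 *     y = 10 + (64 - 1) * (8 - 10) / (128 - 1)  ~  9 points
 * whereas flying = 129 already falls on (129,-100), and anything beyond
 * slides towards -200, sharply throttling a queue with more than 128
 * requests in flight. */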
static noinline
int _do_ranking(struct trans_logger_brick *brick, struct rank_data rkd[])
{
int i;
#ifdef DELAY_CALLERS
int floating_mode;
bool delay_callers;
#endif
ranking_start(rkd, LOGGER_QUEUES);
// check the memory situation...
delay_callers = false;
floating_mode = 1;
if (brick_global_memlimit >= 1024) {
struct rank_info full_punish_global[] = {
{ 0, 0 },
{ brick_global_memlimit * 3 / 4, 0 },
{ brick_global_memlimit, -1000 },
{ RKI_DUMMY }
};
int global_mem_used = atomic64_read(&global_mshadow_used) / 1024;
trans_logger_mem_usage = global_mem_used;
floating_mode = (global_mem_used < brick_global_memlimit / 2) ? 0 : 1;
if (global_mem_used >= brick_global_memlimit)
delay_callers = true;
MARS_IO("global_mem_used = %d\n", global_mem_used);
ranking_compute(&rkd[0], full_punish_global, global_mem_used);
} else if (brick->shadow_mem_limit >= 8) {
struct rank_info full_punish_local[] = {
{ 0, 0 },
{ brick->shadow_mem_limit * 3 / 4, 0 },
{ brick->shadow_mem_limit, -1000 },
{ RKI_DUMMY }
};
int local_mem_used = atomic64_read(&brick->shadow_mem_used) / 1024;
floating_mode = (local_mem_used < brick->shadow_mem_limit / 2) ? 0 : 1;
if (local_mem_used >= brick->shadow_mem_limit)
delay_callers = true;
MARS_IO("local_mem_used = %d\n", local_mem_used);
ranking_compute(&rkd[0], full_punish_local, local_mem_used);
}
if (delay_callers) {
if (!brick->delay_callers) {
brick->delay_callers = true;
atomic_inc(&brick->total_delay_count);
}
} else {
brick->delay_callers = false;
}
// obey the basic rules...
for (i = 0; i < LOGGER_QUEUES; i++) {
int queued = atomic_read(&brick->q_phase[i].q_queued);
@ -1949,62 +2122,44 @@ int _do_ranking(struct trans_logger_brick *brick, struct rank_data rkd[])
MARS_IO("i = %d queued = %d\n", i, queued);
/* This must come first.
* When a queue is empty, you must not credit any positive points.
* Otherwise, (almost) infinite selection of untreatable
* queues may occur.
*/
if (queued <= 0)
continue;
flying = atomic_read(&brick->q_phase[i].q_flying);
if (flying >= brick->q_phase[i].q_max_flying && brick->q_phase[i].q_max_flying > 0)
continue;
if (i == 1 && !floating_mode) {
int lim;
MARS_IO("i = %d queued = %d\n", i, queued);
ranking_compute(&rkd[i], ranks[i], rank_counts[i], queued);
if (atomic_read(&brick->q_phase[0].q_queued) + atomic_read(&brick->q_phase[0].q_flying) > 0) {
break;
}
// ... and the contention rule for queue 0 ...
if (i == 0 && brick->q_phase[0].q_max_flying > 0) {
struct rank_info contention[] = {
{ 0, 0 },
{ brick->q_phase[0].q_max_flying, 300 },
};
MARS_IO("flying = %d\n", flying);
ranking_compute(&rkd[0], contention, 2, flying);
if (elect_leader(&global_writeback) != brick) {
break;
}
if (banning_is_hit(&mars_global_ban)) {
break;
}
lim = mars_limit(&global_writeback.limiter, 0);
if (lim > 0) {
break;
}
}
ranking_compute(&rkd[i], queue_ranks[floating_mode][i], queued);
flying = atomic_read(&brick->q_phase[i].q_flying);
MARS_IO("i = %d queued = %d flying = %d\n", i, queued, flying);
ranking_compute(&rkd[i], fly_ranks[floating_mode][i], flying);
}
// ... and now the exceptions from the rules ...
#ifdef DELAY_CALLERS
delay_callers = false;
if (brick_global_memlimit >= 1024) {
struct rank_info full_punish_global[] = {
{ 0, 0 },
{ brick_global_memlimit * 7 / 8, 0 },
{ brick_global_memlimit, -1000 },
};
int global_mem_used = atomic64_read(&global_mshadow_used) / 1024;
trans_logger_mem_usage = global_mem_used;
if (global_mem_used >= brick_global_memlimit)
delay_callers = true;
MARS_IO("global_mem_used = %d\n", global_mem_used);
ranking_compute(&rkd[0], full_punish_global, 3, global_mem_used);
} else if (brick->shadow_mem_limit >= 8) {
struct rank_info full_punish_local[] = {
{ 0, 0 },
{ brick->shadow_mem_limit * 7 / 8, 0 },
{ brick->shadow_mem_limit, -1000 },
};
int local_mem_used = atomic64_read(&brick->shadow_mem_used) / 1024;
if (local_mem_used >= brick->shadow_mem_limit)
delay_callers = true;
MARS_IO("local_mem_used = %d\n", local_mem_used);
ranking_compute(&rkd[0], full_punish_local, 3, local_mem_used);
}
brick->delay_callers = delay_callers;
#endif
// finalize it
ranking_stop(rkd, LOGGER_QUEUES);
@ -2137,10 +2292,10 @@ void trans_logger_log(struct trans_logger_brick *brick)
switch (winner) {
case 0:
nr = run_mref_queue(&brick->q_phase[0], prep_phase_startio, brick->q_phase[0].q_batchlen);
nr = run_mref_queue(&brick->q_phase[0], prep_phase_startio, brick->q_phase[0].q_batchlen, true);
goto done;
case 1:
nr = run_mref_queue(&brick->q_phase[1], phase1_startio, brick->q_phase[1].q_batchlen);
nr = run_mref_queue(&brick->q_phase[1], phase1_startio, brick->q_phase[1].q_batchlen, true);
goto done;
case 2:
nr = run_wb_queue(&brick->q_phase[2], phase2_startio, brick->q_phase[2].q_batchlen);
@ -2711,6 +2866,7 @@ int trans_logger_brick_construct(struct trans_logger_brick *brick)
atomic_set(&brick->hash_count, 0);
spin_lock_init(&brick->replay_lock);
INIT_LIST_HEAD(&brick->replay_list);
INIT_LIST_HEAD(&brick->group_head);
init_waitqueue_head(&brick->worker_event);
init_waitqueue_head(&brick->caller_event);
qq_init(&brick->q_phase[0], brick);
@ -2732,6 +2888,20 @@ int trans_logger_brick_construct(struct trans_logger_brick *brick)
brick->new_input_nr = TL_INPUT_LOG1;
brick->log_input_nr = TL_INPUT_LOG1;
brick->old_input_nr = TL_INPUT_LOG1;
add_to_group(&global_writeback, brick);
return 0;
}
static noinline
int trans_logger_brick_destruct(struct trans_logger_brick *brick)
{
int i;
for (i = 0; i < TRANS_HASH_MAX; i++) {
struct hash_anchor *start = &brick->hash_table[i];
CHECK_HEAD_EMPTY(&start->hash_anchor);
}
CHECK_HEAD_EMPTY(&brick->replay_list);
remove_from_group(&global_writeback, brick);
return 0;
}
@ -2810,6 +2980,7 @@ const struct trans_logger_brick_type trans_logger_brick_type = {
.default_input_types = trans_logger_input_types,
.default_output_types = trans_logger_output_types,
.brick_construct = &trans_logger_brick_construct,
.brick_destruct = &trans_logger_brick_destruct,
};
EXPORT_SYMBOL_GPL(trans_logger_brick_type);

View File

@ -10,12 +10,27 @@
#include <linux/time.h>
#include "mars.h"
#include "lib_log.h"
#include "lib_pairing_heap.h"
#include "lib_queue.h"
///////////////////////// global tuning ////////////////////////
extern int trans_logger_mem_usage; // in KB
struct writeback_group {
rwlock_t lock;
struct trans_logger_brick *leader;
loff_t biggest;
struct list_head group_anchor;
// tuning
struct mars_limiter limiter;
int until_percent;
};
extern struct writeback_group global_writeback;
////////////////////////////////////////////////////////////////////
_PAIRING_HEAP_TYPEDEF(logger,)
@ -133,6 +148,7 @@ struct trans_logger_brick {
int old_input_nr; // where old IO requests may be on the fly
int replay_code; // replay errors (if any)
// private
struct list_head group_head;
loff_t old_margin;
spinlock_t replay_lock;
struct list_head replay_list;

View File

@ -120,7 +120,7 @@ struct mars_rotate {
// TUNING
int mars_mem_percent = 0;
int mars_mem_percent = 20;
EXPORT_SYMBOL_GPL(mars_mem_percent);
#define CONF_TRANS_SHADOW_LIMIT (1024 * 128) // don't fill the hashtable too much
@ -132,15 +132,13 @@ EXPORT_SYMBOL_GPL(mars_mem_percent);
//#define TRANS_FAKE
#define CONF_TRANS_BATCHLEN 64
#define CONF_TRANS_FLYING 256
#define CONF_TRANS_PRIO MARS_PRIO_HIGH
#define CONF_TRANS_LOG_READS false
//#define CONF_TRANS_LOG_READS true
//#define CONF_TRANS_COMPLETION_SEMANTICS 2
#define CONF_TRANS_COMPLETION_SEMANTICS 0
#define CONF_ALL_BATCHLEN 4
#define CONF_ALL_FLYING 32
#define CONF_ALL_BATCHLEN 1
#define CONF_ALL_PRIO MARS_PRIO_NORMAL
#define IF_SKIP_SYNC true
@ -186,11 +184,6 @@ int _set_trans_params(struct mars_brick *_brick, void *private)
trans_brick->q_phase[2].q_batchlen = CONF_ALL_BATCHLEN;
trans_brick->q_phase[3].q_batchlen = CONF_ALL_BATCHLEN;
trans_brick->q_phase[0].q_max_flying = CONF_TRANS_FLYING;
trans_brick->q_phase[1].q_max_flying = CONF_ALL_FLYING;
trans_brick->q_phase[2].q_max_flying = CONF_ALL_FLYING;
trans_brick->q_phase[3].q_max_flying = CONF_ALL_FLYING;
trans_brick->q_phase[0].q_io_prio = CONF_TRANS_PRIO;
trans_brick->q_phase[1].q_io_prio = CONF_ALL_PRIO;
trans_brick->q_phase[2].q_io_prio = CONF_ALL_PRIO;
@ -3857,7 +3850,7 @@ static int light_thread(void *data)
mars_mem_percent = 0;
if (mars_mem_percent > 70)
mars_mem_percent = 70;
brick_global_memlimit = brick_global_memavail * mars_mem_percent / 100;
brick_global_memlimit = (long long)brick_global_memavail * mars_mem_percent / 100;
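
The new (long long) cast guards against 32-bit overflow; a worked example, assuming brick_global_memavail is a 32-bit int counting KB:

/* On a 64 GiB machine, brick_global_memavail ~ 67108864 (KB).
 * 67108864 * 70 = 4697620480 > INT_MAX = 2147483647, so without the
 * widening cast the product would wrap before the division by 100. */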
brick_msleep(100);
@ -3918,6 +3911,8 @@ static int light_thread(void *data)
_show_statist(&_global);
#endif
MARS_DBG("ban_count = %d ban_renew_count = %d\n", mars_global_ban.ban_count, mars_global_ban.ban_renew_count);
brick_msleep(500);
wait_event_interruptible_timeout(_global.main_event, _global.main_trigger, CONFIG_MARS_SCAN_INTERVAL * HZ);

View File

@ -13,6 +13,8 @@
#include "strategy.h"
#include "mars_proc.h"
#include "../mars_bio.h"
#include "../mars_aio.h"
#include "../mars_client.h"
#include "../mars_server.h"
#include "../mars_trans_logger.h"
@ -161,6 +163,44 @@ EXPORT_SYMBOL_GPL(mars_max_loadavg);
#define _CTL_STRATEGY(handler) /*empty*/
#endif
#define INT_ENTRY(NAME,VAR,MODE) \
{ \
_CTL_NAME \
.procname = NAME, \
.data = &(VAR), \
.maxlen = sizeof(int), \
.mode = MODE, \
.proc_handler = &proc_dointvec, \
_CTL_STRATEGY(sysctl_intvec) \
}
#define LIMITER_ENTRIES(VAR, PREFIX, SUFFIX) \
INT_ENTRY(PREFIX "_limit_" SUFFIX, (VAR)->lim_max_rate, 0600), \
INT_ENTRY(PREFIX "_rate_" SUFFIX, (VAR)->lim_rate, 0600) \
#define THRESHOLD_ENTRIES(VAR, PREFIX) \
INT_ENTRY(PREFIX "_threshold_us", (VAR)->thr_limit, 0600), \
INT_ENTRY(PREFIX "_factor_percent", (VAR)->thr_factor, 0600), \
INT_ENTRY(PREFIX "_plus_us", (VAR)->thr_plus, 0600), \
INT_ENTRY(PREFIX "_triggered", (VAR)->thr_triggered, 0600), \
INT_ENTRY(PREFIX "_true_hit", (VAR)->thr_true_hit, 0600) \
static
ctl_table tuning_table[] = {
LIMITER_ENTRIES(&client_limiter, "network_traffic", "kb"),
LIMITER_ENTRIES(&server_limiter, "server_io", "kb"),
LIMITER_ENTRIES(&global_writeback.limiter, "writeback", "kb"),
INT_ENTRY("writeback_until_percent", global_writeback.until_percent, 0600),
THRESHOLD_ENTRIES(&bio_submit_threshold, "bio_submit"),
THRESHOLD_ENTRIES(&bio_io_threshold[0], "bio_io_r"),
THRESHOLD_ENTRIES(&bio_io_threshold[1], "bio_io_w"),
THRESHOLD_ENTRIES(&aio_submit_threshold, "aio_submit"),
THRESHOLD_ENTRIES(&aio_io_threshold[0], "aio_io_r"),
THRESHOLD_ENTRIES(&aio_io_threshold[1], "aio_io_w"),
THRESHOLD_ENTRIES(&aio_sync_threshold, "aio_sync"),
{}
};
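
For reference, the macros above expand to sysctl files like the following; the /proc/sys/mars root is assumed (it is not visible in this hunk) and the list is illustrative, not exhaustive:

/* /proc/sys/mars/tuning/writeback_limit_kb       <- LIMITER_ENTRIES, lim_max_rate
 * /proc/sys/mars/tuning/writeback_rate_kb        <- LIMITER_ENTRIES, lim_rate
 * /proc/sys/mars/tuning/writeback_until_percent  <- INT_ENTRY
 * /proc/sys/mars/tuning/bio_io_r_threshold_us    <- THRESHOLD_ENTRIES, thr_limit
 * /proc/sys/mars/tuning/aio_sync_factor_percent  <- THRESHOLD_ENTRIES, thr_factor
 * /proc/sys/mars/tuning/aio_submit_plus_us       <- THRESHOLD_ENTRIES, thr_plus
 */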
static
ctl_table mars_table[] = {
{
@ -181,106 +221,20 @@ ctl_table mars_table[] = {
.mode = 0400,
.proc_handler = &errors_sysctl_handler,
},
{
_CTL_NAME
.procname = "percent_mem_limit_kb",
.data = &mars_mem_percent,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "mem_used_kb",
.data = &trans_logger_mem_usage,
.maxlen = sizeof(int),
.mode = 0400,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "logrot_auto_gb",
.data = &global_logrot_auto,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "logdel_auto_gb",
.data = &global_logdel_auto,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "free_space_mb",
.data = &global_free_space,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
INT_ENTRY("percent_mem_limit_kb", mars_mem_percent, 0600),
INT_ENTRY("mem_used_kb", trans_logger_mem_usage, 0400),
INT_ENTRY("logrot_auto_gb", global_logrot_auto, 0600),
INT_ENTRY("logdel_auto_gb", global_logdel_auto, 0600),
INT_ENTRY("free_space_mb", global_free_space, 0600),
#ifdef CONFIG_MARS_LOADAVG_LIMIT
{
_CTL_NAME
.procname = "loadavg_limit",
.data = &mars_max_loadavg,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
INT_ENTRY("loadavg_limit", mars_max_loadavg, 0600),
#endif
INT_ENTRY("network_io_timeout", global_net_io_timeout, 0600),
{
_CTL_NAME
.procname = "network_io_timeout",
.data = &global_net_io_timeout,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "network_traffic_limit_kb",
.data = &client_limiter.lim_max_rate,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "network_traffic_rate_kb",
.data = &client_limiter.lim_rate,
.maxlen = sizeof(int),
.mode = 0400,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "server_io_limit_mb",
.data = &server_limiter.lim_max_rate,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "server_io_rate_mb",
.data = &server_limiter.lim_rate,
.maxlen = sizeof(int),
.mode = 0400,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
.procname = "tuning",
.mode = 0500,
.child = tuning_table,
},
{}
};