all: IO scheduling improvements, tuning

Drastically boost random write performance on RAID controllers with BBUs.
Writeback is only performed when there is no IO contention / starvation.
The old IO contention controller was suited to workstations.
Now server loads are well-controlled even when BBUs are present.
Thomas Schoebel-Theuer 2012-10-15 16:35:36 +02:00 committed by Thomas Schoebel-Theuer
parent d510dd1a3b
commit c319230fa1
13 changed files with 596 additions and 338 deletions

View File

@ -11,7 +11,6 @@
atomic_t q_total; \
/* tunables */ \
int q_batchlen; \
int q_max_flying; \
int q_io_prio; \
bool q_ordering; \
/* private */ \

View File

@ -11,28 +11,27 @@
#include "mars.h"
#include "lib_rank.h"
void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int rki_count, int x)
void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int x)
{
int i;
MARS_IO("rki_count = %d at x = %d\n", rki_count, x);
BUG_ON(rki_count < 2);
rki_count--;
for (i = 0; i < rki_count; i++) {
for (i = 0; ; i++) {
int x0 = rki[i].rki_x;
int x1;
int y0;
int y1;
int points;
if (x0 == RKI_DUMMY)
break;
if (x < x0 && i+1 < rki_count)
if (x < x0)
continue;
x1 = rki[i+1].rki_x;
BUG_ON(x1 == x0);
if (x1 == RKI_DUMMY)
break;
y0 = rki[i].rki_y;
y1 = rki[i+1].rki_y;
@ -59,7 +58,7 @@ int ranking_select(struct rank_data rkd[], int rkd_count)
int rest = tmp->rkd_current_points;
if (rest <= 0)
continue;
rest -= tmp->rkd_got;
//rest -= tmp->rkd_got;
if (rest > max) {
max = rest;
res = i;
@ -69,9 +68,9 @@ int ranking_select(struct rank_data rkd[], int rkd_count)
* and reset the "clocks" after each round of
* weighted round-robin selection.
*/
if (max <= 0 && res >= 0) {
if (max < 0 && res >= 0) {
for (i = 0; i < rkd_count; i++)
rkd[i].rkd_got -= rkd[i].rkd_current_points;
rkd[i].rkd_got += max;
}
MARS_IO("res = %d\n", res);
return res;

View File

@ -7,6 +7,8 @@
/* Generic round-robin scheduler based on ranking information.
*/
#define RKI_DUMMY INT_MIN
struct rank_info {
int rki_x;
int rki_y;
@ -41,8 +43,10 @@ struct rank_data {
* Important: the rki[] array describes a ranking function at some
* example points (x_i,y_i) which must be ordered according to x_i
* in ascending order. And, of course, you need to supply at least
* rki_count >= 2 sample points (otherwise a linear function cannot
* two sample points (otherwise a linear function cannot
* be described).
* The array _must_ always end with a dummy record where the x_i has the
* value RKI_DUMMY.
*/
extern inline
@ -54,7 +58,7 @@ void ranking_start(struct rank_data rkd[], int rkd_count)
}
}
extern void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int rki_count, int x);
extern void ranking_compute(struct rank_data *rkd, const struct rank_info rki[], int x);
/* This may be used to (exceptionally) add some extra salt...
*/
@ -105,9 +109,11 @@ extern int ranking_select(struct rank_data rkd[], int rkd_count);
extern inline
void ranking_select_done(struct rank_data rkd[], int winner, int win_points)
{
if (win_points < 1)
win_points = 1;
rkd[winner].rkd_got += win_points;
if (winner >= 0) {
if (win_points < 1)
win_points = 1;
rkd[winner].rkd_got += win_points;
}
}
#endif
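
The header comment above fixes the lib_rank calling convention. As an editorial aid, here is a minimal hypothetical usage sketch; the table values, the two-queue setup and the names queued[]/work_done are invented for illustration and do not appear in this commit:

static const struct rank_info example_rank[] = {
	{     0,   0 },
	{     1, 100 },
	{ 10000, 100 },
	{ RKI_DUMMY }	/* mandatory terminator */
};

static void example_round(struct rank_data rkd[2], int queued[2])
{
	int winner;
	int i;

	ranking_start(rkd, 2);
	for (i = 0; i < 2; i++) {
		/* x is the current queue length; empty queues earn 0 points */
		ranking_compute(&rkd[i], example_rank, queued[i]);
	}
	ranking_stop(rkd, 2);

	winner = ranking_select(rkd, 2);
	if (winner >= 0) {
		int work_done = 1;	/* e.g. number of requests actually started */
		/* ... submit one batch from queue 'winner' here ... */
		ranking_select_done(rkd, winner, work_done);
	}
}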

8
mars.h
View File

@ -16,6 +16,7 @@
#include "brick.h"
#include "brick_mem.h"
#include "lib_timing.h"
#define GFP_MARS GFP_BRICK
@ -264,6 +265,13 @@ extern void (*_mars_remote_trigger)(void);
/////////////////////////////////////////////////////////////////////////
/* Some global stuff.
*/
extern struct banning mars_global_ban;
/////////////////////////////////////////////////////////////////////////
/* Some special brick types for avoidance of cyclic references.
*
* The client/server network bricks use this for independent instantiation

View File

@ -18,48 +18,47 @@
#include "mars.h"
#include "lib_timing.h"
#include "mars_aio.h"
#define MARS_MAX_AIO 1024
#define MARS_MAX_AIO_READ 32
#define MEASURE_SYNC 8
static struct timing_stats timings[2] = {};
static struct timing_stats timings[3] = {};
struct threshold aio_submit_threshold = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_SUBMIT_MAX_LATENCY,
.thr_factor = 10,
.thr_plus = 10000,
};
EXPORT_SYMBOL_GPL(aio_submit_threshold);
struct threshold aio_io_threshold[2] = {
[0] = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_IO_R_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
},
[1] = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_IO_W_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
},
};
EXPORT_SYMBOL_GPL(aio_io_threshold);
struct threshold aio_sync_threshold = {
.thr_ban = &mars_global_ban,
.thr_limit = AIO_SYNC_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
};
EXPORT_SYMBOL_GPL(aio_sync_threshold);
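
The new threshold objects couple measured latencies to the global banning mechanism. threshold_check() itself is not part of this diff (it lives in lib_timing), so the following is only a plausible reading of the tunables, not the confirmed implementation:

/* Assumed semantics, NOT confirmed by this diff: when a measured latency
 * exceeds thr_limit (microseconds), the threshold counts a hit and renews
 * the ban at thr_ban for a latency-proportional window, roughly
 *
 *     ban_us = latency * thr_factor / 100 + thr_plus;
 *
 * Example with aio_submit_threshold (limit 1000 us, factor 10, plus 10000):
 * a 2000 us submission would give 2000 * 10 / 100 + 10000 = 10200 us of
 * banning, during which the banning_is_hit(&mars_global_ban) check in the
 * trans_logger's _do_ranking() makes writeback back off.
 */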
///////////////////////// own type definitions ////////////////////////
#include "mars_aio.h"
#ifdef MEASURE_SYNC
static int sync_ticks[MEASURE_SYNC] = {};
static void measure_sync(int ticks)
{
int order = ticks;
if (ticks > 1) {
order = MEASURE_SYNC - 1;
while (order > 0 && (1 << (order-1)) >= ticks) {
order--;
}
order++;
}
sync_ticks[order]++;
}
static char *show_sync(void)
{
char *res = brick_string_alloc(0);
int i;
int pos = 0;
if (!res)
return NULL;
for (i = 0; i < MEASURE_SYNC; i++) {
pos += snprintf(res + pos, 256, "%d: %d ", i, sync_ticks[i]);
}
return res;
}
#endif
////////////////// some helpers //////////////////
static inline
@ -77,6 +76,8 @@ void _enqueue(struct aio_threadinfo *tinfo, struct aio_mref_aspect *mref_a, int
prio = 0;
#endif
mref_a->enqueue_stamp = cpu_clock(raw_smp_processor_id());
traced_lock(&tinfo->lock, flags);
if (at_end) {
@ -84,38 +85,45 @@ void _enqueue(struct aio_threadinfo *tinfo, struct aio_mref_aspect *mref_a, int
} else {
list_add(&mref_a->io_head, &tinfo->mref_list[prio]);
}
tinfo->queued[prio]++;
tinfo->queued_sum++;
traced_unlock(&tinfo->lock, flags);
atomic_inc(&tinfo->total_enqueue_count);
wake_up_interruptible_all(&tinfo->event);
}
static inline
struct aio_mref_aspect *_dequeue(struct aio_threadinfo *tinfo, bool do_remove)
struct aio_mref_aspect *_dequeue(struct aio_threadinfo *tinfo)
{
struct aio_mref_aspect *mref_a = NULL;
int prio;
unsigned long flags = 0;
if (do_remove)
traced_lock(&tinfo->lock, flags);
traced_lock(&tinfo->lock, flags);
for (prio = 0; prio < MARS_PRIO_NR; prio++) {
struct list_head *start = &tinfo->mref_list[prio];
struct list_head *tmp = start->next;
if (tmp != start) {
if (do_remove) {
list_del_init(tmp);
atomic_inc(&tinfo->total_dequeue_count);
}
list_del_init(tmp);
tinfo->queued[prio]--;
tinfo->queued_sum--;
mref_a = container_of(tmp, struct aio_mref_aspect, io_head);
goto done;
}
}
done:
if (do_remove)
traced_unlock(&tinfo->lock, flags);
traced_unlock(&tinfo->lock, flags);
if (likely(mref_a && mref_a->object)) {
unsigned long long latency;
latency = cpu_clock(raw_smp_processor_id()) - mref_a->enqueue_stamp;
threshold_check(&aio_io_threshold[mref_a->object->ref_rw & 1], latency);
}
return mref_a;
}
@ -272,8 +280,6 @@ static void aio_ref_io(struct aio_output *output, struct mref_object *mref)
}
_enqueue(tinfo, mref_a, mref->ref_prio, true);
wake_up_interruptible_all(&tinfo->event);
return;
done:
@ -295,16 +301,20 @@ static int aio_submit(struct aio_output *output, struct aio_mref_aspect *mref_a,
// .aio_reqprio = something(mref->ref_prio) field exists, but not yet implemented in kernelspace :(
};
struct iocb *iocbp = &iocb;
unsigned long long latency;
mars_trace(mref, "aio_submit");
oldfs = get_fs();
set_fs(get_ds());
TIME_STATS(&timings[mref->ref_rw & 1], res = sys_io_submit(output->ctxp, 1, &iocbp));
latency = TIME_STATS(&timings[mref->ref_rw & 1], res = sys_io_submit(output->ctxp, 1, &iocbp));
set_fs(oldfs);
threshold_check(&aio_submit_threshold, latency);
if (res < 0 && res != -EAGAIN)
MARS_ERR("error = %d\n", res);
return res;
}
@ -327,10 +337,12 @@ static int aio_submit_dummy(struct aio_output *output)
}
static
int aio_start_thread(struct aio_output *output, int i, int(*fn)(void*))
int aio_start_thread(
struct aio_output *output,
struct aio_threadinfo *tinfo,
int(*fn)(void*),
char class)
{
static int index = 0;
struct aio_threadinfo *tinfo = &output->tinfo[i];
int j;
for (j = 0; j < MARS_PRIO_NR; j++) {
@ -339,8 +351,9 @@ int aio_start_thread(struct aio_output *output, int i, int(*fn)(void*))
tinfo->output = output;
spin_lock_init(&tinfo->lock);
init_waitqueue_head(&tinfo->event);
init_waitqueue_head(&tinfo->terminate_event);
tinfo->terminated = false;
tinfo->thread = kthread_create(fn, tinfo, "mars_%daio%d", i, index++);
tinfo->thread = kthread_create(fn, tinfo, "mars_aio_%c%d", class, output->index);
if (IS_ERR(tinfo->thread)) {
int err = PTR_ERR(tinfo->thread);
MARS_ERR("cannot create thread\n");
@ -415,7 +428,7 @@ void aio_stop_thread(struct aio_output *output, int i, bool do_submit_dummy)
// wait for termination
MARS_INF("waiting for thread %d ...\n", i);
wait_event_interruptible_timeout(
tinfo->event,
tinfo->terminate_event,
tinfo->terminated,
(60 - i * 2) * HZ);
if (likely(tinfo->terminated)) {
@ -434,29 +447,28 @@ static
int aio_sync(struct file *file)
{
int err;
#ifdef MEASURE_SYNC
long long old_jiffies = jiffies;
#endif
err = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
#ifdef MEASURE_SYNC
measure_sync(jiffies - old_jiffies);
#endif
return err;
}
static
void aio_sync_all(struct aio_output *output, struct list_head *tmp_list)
{
unsigned long long latency;
int err;
output->fdsync_active = true;
atomic_inc(&output->total_fdsync_count);
err = aio_sync(output->filp);
latency = TIME_STATS(
&timings[2],
err = aio_sync(output->filp)
);
threshold_check(&aio_sync_threshold, latency);
output->fdsync_active = false;
wake_up_interruptible_all(&output->fdsync_event);
if (err < 0) {
@ -529,7 +541,7 @@ int aio_sync_thread(void *data)
MARS_INF("kthread has started on '%s'.\n", output->brick->brick_path);
//set_user_nice(current, -20);
while (!kthread_should_stop()) {
while (!kthread_should_stop() || tinfo->queued_sum > 0) {
LIST_HEAD(tmp_list);
unsigned long flags;
int i;
@ -539,9 +551,8 @@ int aio_sync_thread(void *data)
wait_event_interruptible_timeout(
tinfo->event,
kthread_should_stop() ||
_dequeue(tinfo, false),
1 * HZ);
tinfo->queued_sum > 0,
HZ / 4);
traced_lock(&tinfo->lock, flags);
for (i = 0; i < MARS_PRIO_NR; i++) {
@ -549,6 +560,8 @@ int aio_sync_thread(void *data)
if (!list_empty(start)) {
// move over the whole list
list_replace_init(start, &tmp_list);
tinfo->queued_sum -= tinfo->queued[i];
tinfo->queued[i] = 0;
break;
}
}
@ -568,7 +581,7 @@ int aio_sync_thread(void *data)
MARS_INF("kthread has stopped.\n");
tinfo->terminated = true;
wake_up_interruptible_all(&tinfo->event);
wake_up_interruptible_all(&tinfo->terminate_event);
return 0;
}
@ -586,17 +599,16 @@ static int aio_event_thread(void *data)
if (!current->mm)
goto err;
err = aio_start_thread(output, 2, aio_sync_thread);
err = aio_start_thread(output, &output->tinfo[2], aio_sync_thread, 'y');
if (unlikely(err < 0))
goto err;
while (!kthread_should_stop()) {
while (!kthread_should_stop() || tinfo->queued_sum > 0) {
mm_segment_t oldfs;
int count;
int bounced;
int i;
struct timespec timeout = {
.tv_sec = 10,
.tv_sec = 1,
};
struct io_event events[MARS_MAX_AIO_READ];
@ -609,7 +621,6 @@ static int aio_event_thread(void *data)
set_fs(oldfs);
//MARS_INF("count = %d\n", count);
bounced = 0;
for (i = 0; i < count; i++) {
struct aio_mref_aspect *mref_a = (void*)events[i].data;
struct mref_object *mref;
@ -631,7 +642,6 @@ static int aio_event_thread(void *data)
if (!output->filp->f_op->aio_fsync) {
mars_trace(mref, "aio_fsync");
_enqueue(other, mref_a, mref->ref_prio, true);
bounced++;
continue;
}
err = aio_submit(output, mref_a, true);
@ -642,8 +652,6 @@ static int aio_event_thread(void *data)
_complete(output, mref, err);
}
if (bounced)
wake_up_interruptible_all(&other->event);
}
err = 0;
@ -655,7 +663,7 @@ static int aio_event_thread(void *data)
unuse_fake_mm();
tinfo->terminated = true;
wake_up_interruptible_all(&tinfo->event);
wake_up_interruptible_all(&tinfo->terminate_event);
return err;
}
@ -759,11 +767,11 @@ static int aio_submit_thread(void *data)
if (unlikely(err < 0))
goto cleanup_mm;
err = aio_start_thread(output, 1, aio_event_thread);
err = aio_start_thread(output, &output->tinfo[1], aio_event_thread, 'e');
if (unlikely(err < 0))
goto cleanup_ctxp;
while (!kthread_should_stop() || atomic_read(&output->read_count) > 0 || atomic_read(&output->write_count) > 0) {
while (!kthread_should_stop() || atomic_read(&output->read_count) + atomic_read(&output->write_count) + tinfo->queued_sum > 0) {
struct aio_mref_aspect *mref_a;
struct mref_object *mref;
int sleeptime;
@ -773,11 +781,10 @@ static int aio_submit_thread(void *data)
wait_event_interruptible_timeout(
tinfo->event,
kthread_should_stop() ||
_dequeue(tinfo, false),
HZ);
tinfo->queued_sum > 0,
HZ / 4);
mref_a = _dequeue(tinfo, true);
mref_a = _dequeue(tinfo);
if (!mref_a) {
continue;
}
@ -803,7 +810,7 @@ static int aio_submit_thread(void *data)
mref_a->start_jiffies = jiffies;
}
if ((long long)jiffies - mref_a->start_jiffies <= mref->ref_timeout) {
if (!_dequeue(tinfo, false)) {
if (!tinfo->queued_sum) {
atomic_inc(&output->total_msleep_count);
brick_msleep(1000 * 4 / HZ);
}
@ -816,9 +823,9 @@ static int aio_submit_thread(void *data)
}
}
sleeptime = 1000 / HZ;
sleeptime = 1;
for (;;) {
/* This is just a test. Don't use it for performance reasons.
/* This is just a test. Don't enable it for performance reasons.
*/
if (output->brick->wait_during_fdsync && mref->ref_rw != READ) {
if (output->fdsync_active) {
@ -826,7 +833,7 @@ static int aio_submit_thread(void *data)
atomic_inc(&output->total_fdsync_wait_count);
__wait_event_interruptible_timeout(
output->fdsync_event,
!output->fdsync_active || kthread_should_stop(),
!output->fdsync_active,
delay);
}
@ -842,7 +849,7 @@ static int aio_submit_thread(void *data)
atomic_inc(&output->total_delay_count);
brick_msleep(sleeptime);
if (sleeptime < 100) {
sleeptime += 1000 / HZ;
sleeptime++;
}
}
err:
@ -876,7 +883,7 @@ cleanup_fd:
done:
MARS_DBG("status = %d\n", err);
tinfo->terminated = true;
wake_up_interruptible_all(&tinfo->event);
wake_up_interruptible_all(&tinfo->terminate_event);
return err;
}
@ -904,12 +911,9 @@ char *aio_statistics(struct aio_brick *brick, int verbose)
if (!res)
return NULL;
#ifdef MEASURE_SYNC
sync = show_sync();
#endif
pos += report_timing(&timings[0], res + pos, 4096 - pos);
pos += report_timing(&timings[1], res + pos, 4096 - pos);
pos += report_timing(&timings[2], res + pos, 4096 - pos);
snprintf(res + pos, 4096 - pos,
"total "
@ -927,10 +931,14 @@ char *aio_statistics(struct aio_brick *brick, int verbose)
"flying reads = %d "
"writes = %d "
"allocs = %d "
"q0 = %d (%d - %d) "
"q1 = %d (%d - %d) "
"q2 = %d (%d - %d) |"
" %s\n",
"q0 = %d "
"q1 = %d "
"q2 = %d "
"| total "
"q0 = %d "
"q1 = %d "
"q2 = %d "
"%s\n",
atomic_read(&output->total_read_count),
atomic_read(&output->total_write_count),
atomic_read(&output->total_alloc_count),
@ -945,12 +953,12 @@ char *aio_statistics(struct aio_brick *brick, int verbose)
atomic_read(&output->read_count),
atomic_read(&output->write_count),
atomic_read(&output->alloc_count),
atomic_read(&output->tinfo[0].total_enqueue_count) - atomic_read(&output->tinfo[0].total_dequeue_count),
atomic_read(&output->tinfo[0].total_enqueue_count), atomic_read(&output->tinfo[0].total_dequeue_count),
atomic_read(&output->tinfo[1].total_enqueue_count) - atomic_read(&output->tinfo[1].total_dequeue_count),
atomic_read(&output->tinfo[1].total_enqueue_count), atomic_read(&output->tinfo[1].total_dequeue_count),
atomic_read(&output->tinfo[2].total_enqueue_count) - atomic_read(&output->tinfo[2].total_dequeue_count),
atomic_read(&output->tinfo[2].total_enqueue_count), atomic_read(&output->tinfo[2].total_dequeue_count),
output->tinfo[0].queued_sum,
output->tinfo[1].queued_sum,
output->tinfo[2].queued_sum,
atomic_read(&output->tinfo[0].total_enqueue_count),
atomic_read(&output->tinfo[1].total_enqueue_count),
atomic_read(&output->tinfo[2].total_enqueue_count),
sync ? sync : "");
if (sync)
@ -975,7 +983,6 @@ void aio_reset_statistics(struct aio_brick *brick)
for (i = 0; i < 3; i++) {
struct aio_threadinfo *tinfo = &output->tinfo[i];
atomic_set(&tinfo->total_enqueue_count, 0);
atomic_set(&tinfo->total_dequeue_count, 0);
}
}
@ -1006,6 +1013,7 @@ static int aio_brick_construct(struct aio_brick *brick)
static int aio_switch(struct aio_brick *brick)
{
static int index;
struct aio_output *output = brick->outputs[0];
const char *path = output->brick->brick_path;
int flags = O_CREAT | O_RDWR | O_LARGEFILE;
@ -1052,7 +1060,8 @@ static int aio_switch(struct aio_brick *brick)
}
#endif
err = aio_start_thread(output, 0, aio_submit_thread);
output->index = ++index;
err = aio_start_thread(output, &output->tinfo[0], aio_submit_thread, 's');
if (err < 0)
goto err;

View File

@ -5,6 +5,15 @@
#include <linux/aio.h>
#include <linux/syscalls.h>
#define AIO_SUBMIT_MAX_LATENCY 1000 // 1 ms
#define AIO_IO_R_MAX_LATENCY 50000 // 50 ms
#define AIO_IO_W_MAX_LATENCY 150000 // 150 ms
#define AIO_SYNC_MAX_LATENCY 150000 // 150 ms
extern struct threshold aio_submit_threshold;
extern struct threshold aio_io_threshold[2];
extern struct threshold aio_sync_threshold;
//#define USE_CLEVER_SYNC // TODO: NYI (should result in better write performance)
#ifdef USE_CLEVER_SYNC
@ -24,6 +33,7 @@ struct aio_mref_aspect {
struct pairing_heap_sync heap_head;
#endif
struct list_head io_head;
unsigned long long enqueue_stamp;
long long start_jiffies;
int resubmit;
int alloc_len;
@ -50,10 +60,12 @@ struct aio_threadinfo {
struct aio_output *output;
struct task_struct *thread;
wait_queue_head_t event;
wait_queue_head_t terminate_event;
spinlock_t lock;
bool terminated;
int queued[MARS_PRIO_NR];
int queued_sum;
atomic_t total_enqueue_count;
atomic_t total_dequeue_count;
bool terminated;
};
struct aio_output {
@ -68,6 +80,7 @@ struct aio_output {
wait_queue_head_t fdsync_event;
bool fdsync_active;
// statistics
int index;
atomic_t total_read_count;
atomic_t total_write_count;
atomic_t total_alloc_count;

View File

@ -19,11 +19,35 @@
#include "mars.h"
#include "lib_timing.h"
#include "mars_bio.h"
static struct timing_stats timings[2] = {};
///////////////////////// own type definitions ////////////////////////
struct threshold bio_submit_threshold = {
.thr_ban = &mars_global_ban,
.thr_limit = BIO_SUBMIT_MAX_LATENCY,
.thr_factor = 100,
.thr_plus = 0,
};
EXPORT_SYMBOL_GPL(bio_submit_threshold);
#include "mars_bio.h"
struct threshold bio_io_threshold[2] = {
[0] = {
.thr_ban = &mars_global_ban,
.thr_limit = BIO_IO_R_MAX_LATENCY,
.thr_factor = 10,
.thr_plus = 10000,
},
[1] = {
.thr_ban = &mars_global_ban,
.thr_limit = BIO_IO_W_MAX_LATENCY,
.thr_factor = 10,
.thr_plus = 10000,
},
};
EXPORT_SYMBOL_GPL(bio_io_threshold);
///////////////////////// own type definitions ////////////////////////
static void bio_ref_put(struct bio_output *output, struct mref_object *mref);
@ -46,10 +70,9 @@ void bio_callback(struct bio *bio, int code)
mref_a->status_code = code;
spin_lock_irqsave(&brick->lock, flags);
if (list_empty(&mref_a->io_head)) {
list_add_tail(&mref_a->io_head, &brick->completed_list);
atomic_inc(&brick->completed_count);
}
list_del(&mref_a->io_head);
list_add_tail(&mref_a->io_head, &brick->completed_list);
atomic_inc(&brick->completed_count);
spin_unlock_irqrestore(&brick->lock, flags);
wake_up_interruptible(&brick->response_event);
@ -303,6 +326,8 @@ void _bio_ref_io(struct bio_output *output, struct mref_object *mref, bool cork)
struct bio_brick *brick = output->brick;
struct bio_mref_aspect *mref_a = bio_mref_get_aspect(output->brick, mref);
struct bio *bio;
unsigned long long latency;
unsigned long flags;
int rw;
int status = -EINVAL;
@ -338,13 +363,23 @@ void _bio_ref_io(struct bio_output *output, struct mref_object *mref, bool cork)
MARS_IO("starting IO rw = %d prio 0 %d fly = %d\n", rw, mref->ref_prio, atomic_read(&brick->fly_count[PRIO_INDEX(mref)]));
mars_trace(mref, "bio_submit");
mref_a->start_stamp = cpu_clock(raw_smp_processor_id());
spin_lock_irqsave(&brick->lock, flags);
list_add_tail(&mref_a->io_head, &brick->submitted_list[rw & 1]);
spin_unlock_irqrestore(&brick->lock, flags);
#ifdef FAKE_IO
bio->bi_end_io(bio, 0);
#else
bio->bi_rw = rw;
TIME_STATS(&timings[rw & 1], submit_bio(rw, bio));
latency = TIME_STATS(
&timings[rw & 1],
submit_bio(rw, bio)
);
#endif
threshold_check(&bio_submit_threshold, latency);
status = 0;
if (unlikely(bio_flagged(bio, BIO_EOPNOTSUPP)))
status = -EOPNOTSUPP;
@ -401,16 +436,30 @@ int bio_response_thread(void *data)
for (;;) {
LIST_HEAD(tmp_list);
unsigned long flags;
int thr_limit;
int sleeptime;
int count;
int i;
thr_limit = bio_io_threshold[0].thr_limit;
if (bio_io_threshold[1].thr_limit < thr_limit)
thr_limit = bio_io_threshold[1].thr_limit;
sleeptime = HZ / 10;
if (thr_limit > 0) {
sleeptime = thr_limit / (1000000 * 2 / HZ);
if (unlikely(sleeptime < 2))
sleeptime = 2;
}
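
A worked example of the sleep-time formula, assuming HZ = 250 (one jiffy = 4000 us):

/* thr_limit = min(BIO_IO_R_MAX_LATENCY, BIO_IO_W_MAX_LATENCY) = 40000 us
 * sleeptime = 40000 / (1000000 * 2 / 250) = 40000 / 8000 = 5 jiffies = 20 ms
 * i.e. the response thread polls at half of the smaller IO latency
 * threshold, clamped to a minimum of 2 jiffies. */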
#ifdef IO_DEBUGGING
round++;
MARS_IO("%d sleeping...\n", round);
MARS_IO("%d sleeping %d...\n", round, sleeptime);
#endif
wait_event_interruptible_timeout(
brick->response_event,
atomic_read(&brick->completed_count) > 0,
HZ);
sleeptime);
MARS_IO("%d woken up, completed_count = %d fly_count[0] = %d fly_count[1] = %d fly_count[2] = %d\n",
round,
@ -428,6 +477,7 @@ int bio_response_thread(void *data)
struct list_head *tmp;
struct bio_mref_aspect *mref_a;
struct mref_object *mref;
unsigned long long latency;
int code;
if (list_empty(&tmp_list)) {
@ -439,16 +489,20 @@ int bio_response_thread(void *data)
tmp = tmp_list.next;
list_del_init(tmp);
atomic_dec(&brick->completed_count);
mref_a = container_of(tmp, struct bio_mref_aspect, io_head);
mref = mref_a->object;
latency = cpu_clock(raw_smp_processor_id()) - mref_a->start_stamp;
threshold_check(&bio_io_threshold[mref->ref_rw & 1], latency);
code = mref_a->status_code;
#ifdef IO_DEBUGGING
round++;
MARS_IO("%d completed , status = %d\n", round, code);
#endif
mref = mref_a->object;
mars_trace(mref, "bio_endio");
if (code < 0) {
@ -473,6 +527,26 @@ int bio_response_thread(void *data)
}
bio_ref_put(mref_a->output, mref);
}
/* Try to detect slow requests as early as possible,
* even before they have completed.
*/
for (i = 0; i < 2; i++) {
unsigned long long eldest = 0;
spin_lock_irqsave(&brick->lock, flags);
if (!list_empty(&brick->submitted_list[i])) {
struct bio_mref_aspect *mref_a;
mref_a = container_of(brick->submitted_list[i].next, struct bio_mref_aspect, io_head);
eldest = mref_a->start_stamp;
}
spin_unlock_irqrestore(&brick->lock, flags);
if (eldest) {
threshold_check(&bio_io_threshold[i], cpu_clock(raw_smp_processor_id()) - eldest);
}
}
if (count) {
brick->submitted = true;
wake_up_interruptible(&brick->submit_event);
@ -510,7 +584,7 @@ int bio_submit_thread(void *data)
wait_event_interruptible_timeout(
brick->submit_event,
brick->submitted,
HZ);
HZ / 2);
brick->submitted = false;
@ -719,6 +793,8 @@ static int bio_brick_construct(struct bio_brick *brick)
INIT_LIST_HEAD(&brick->queue_list[0]);
INIT_LIST_HEAD(&brick->queue_list[1]);
INIT_LIST_HEAD(&brick->queue_list[2]);
INIT_LIST_HEAD(&brick->submitted_list[0]);
INIT_LIST_HEAD(&brick->submitted_list[1]);
INIT_LIST_HEAD(&brick->completed_list);
init_waitqueue_head(&brick->submit_event);
init_waitqueue_head(&brick->response_event);

View File

@ -2,6 +2,13 @@
#ifndef MARS_BIO_H
#define MARS_BIO_H
#define BIO_SUBMIT_MAX_LATENCY 250 // 250 us
#define BIO_IO_R_MAX_LATENCY 40000 // 40 ms
#define BIO_IO_W_MAX_LATENCY 100000 // 100 ms
extern struct threshold bio_submit_threshold;
extern struct threshold bio_io_threshold[2];
#include <linux/blkdev.h>
struct bio_mref_aspect {
@ -9,6 +16,7 @@ struct bio_mref_aspect {
struct list_head io_head;
struct bio *bio;
struct bio_output *output;
unsigned long long start_stamp;
int status_code;
int hash_pos;
int alloc_len;
@ -33,6 +41,7 @@ struct bio_brick {
// private
spinlock_t lock;
struct list_head queue_list[MARS_PRIO_NR];
struct list_head submitted_list[2];
struct list_head completed_list;
wait_queue_head_t submit_event;
wait_queue_head_t response_event;

View File

@ -18,6 +18,9 @@
// infrastructure
struct banning mars_global_ban = {};
EXPORT_SYMBOL_GPL(mars_global_ban);
static char *id = NULL;
/* TODO: better use MAC addresses (or motherboard IDs where available).

View File

@ -11,12 +11,12 @@
// variants
#define KEEP_UNIQUE
//#define LATER
#define DELAY_CALLERS // this is _needed_ for production systems
//#define WB_COPY // unnecessary (only costs performance)
//#define LATE_COMPLETE // unnecessary (only costs performance)
//#define EARLY_COMPLETION
//#define OLD_POSCOMPLETE
#define SHORTCUT_1_to_3 // when possible, queue 1 executes phase3_startio() directly without intermediate queueing into queue 3 => may be irritating, but has better performance. NOTICE: when some day the IO scheduling should be different between queue 1 and 3, you MUST disable this in order to distinguish between them!
// commenting this out is dangerous for data integrity! use only for testing!
#define USE_MEMCPY
@ -33,23 +33,79 @@
#include "lib_rank.h"
#include "lib_limiter.h"
#include "mars_trans_logger.h"
#ifdef REPLAY_DEBUGGING
#define MARS_RPL(_fmt, _args...) _MARS_MSG(false, "REPLAY ", _fmt, ##_args)
#else
#define MARS_RPL(_args...) /*empty*/
#endif
#if 0
#define inline noinline
#endif
///////////////////////// global tuning ////////////////////////
int trans_logger_mem_usage; // in KB
EXPORT_SYMBOL_GPL(trans_logger_mem_usage);
struct writeback_group global_writeback = {
.lock = __RW_LOCK_UNLOCKED(global_writeback.lock),
.group_anchor = LIST_HEAD_INIT(global_writeback.group_anchor),
.until_percent = 30,
};
EXPORT_SYMBOL_GPL(global_writeback);
static
void add_to_group(struct writeback_group *gr, struct trans_logger_brick *brick)
{
write_lock(&gr->lock);
list_add_tail(&brick->group_head, &gr->group_anchor);
write_unlock(&gr->lock);
}
static
void remove_from_group(struct writeback_group *gr, struct trans_logger_brick *brick)
{
write_lock(&gr->lock);
list_del_init(&brick->group_head);
gr->leader = NULL;
write_unlock(&gr->lock);
}
static
struct trans_logger_brick *elect_leader(struct writeback_group *gr)
{
struct trans_logger_brick *res = gr->leader;
struct list_head *tmp;
if (res && gr->until_percent >= 0) {
loff_t used = atomic64_read(&res->shadow_mem_used);
if (used > gr->biggest * gr->until_percent / 100)
goto done;
}
read_lock(&gr->lock);
for (tmp = gr->group_anchor.next; tmp != &gr->group_anchor; tmp = tmp->next) {
struct trans_logger_brick *test = container_of(tmp, struct trans_logger_brick, group_head);
loff_t new_used = atomic64_read(&test->shadow_mem_used);
if (!res || new_used > atomic64_read(&res->shadow_mem_used)) {
res = test;
gr->biggest = new_used;
}
}
read_unlock(&gr->lock);
gr->leader = res;
done:
return res;
}
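
A numeric illustration of the election rule (figures invented; units are whatever shadow_mem_used counts):

/* Suppose three bricks hold 900, 400 and 100 units of shadow memory.
 * The scan elects the 900-unit brick and records biggest = 900.
 * With the default until_percent = 30, elect_leader() keeps returning
 * that brick while its usage stays above 900 * 30 / 100 = 270 units;
 * only after it has written back below that mark is a new leader
 * elected, so writeback concentrates on one brick at a time instead of
 * thrashing between all of them. */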
///////////////////////// own type definitions ////////////////////////
#include "mars_trans_logger.h"
#if 1
#define inline noinline
#endif
static inline
int lh_cmp(loff_t *a, loff_t *b)
{
@ -1587,7 +1643,7 @@ bool phase1_startio(struct trans_logger_mref_aspect *orig_mref_a)
qq_inc_flying(&brick->q_phase[1]);
fire_writeback(&wb->w_sub_read_list, false);
} else { // shortcut
#ifdef LATER
#ifndef SHORTCUT_1_to_3
qq_wb_insert(&brick->q_phase[3], wb);
wake_up_interruptible_all(&brick->worker_event);
#else
@ -1817,9 +1873,10 @@ bool phase3_startio(struct writeback_info *wb)
*/
static noinline
int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_mref_aspect *sub_mref_a), int max)
int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_mref_aspect *sub_mref_a), int max, bool do_limit)
{
struct trans_logger_brick *brick = q->q_brick;
int total_len = 0;
bool found = false;
bool ok;
int res = 0;
@ -1830,6 +1887,9 @@ int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_m
if (!mref_a)
goto done;
if (do_limit && likely(mref_a->object))
total_len += mref_a->object->ref_len;
ok = startio(mref_a);
if (unlikely(!ok)) {
qq_mref_pushback(q, mref_a);
@ -1842,6 +1902,7 @@ int run_mref_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_m
done:
if (found) {
mars_limit(&global_writeback.limiter, (total_len - 1) / 1024 + 1);
wake_up_interruptible_all(&brick->worker_event);
}
return res;
@ -1851,6 +1912,7 @@ static noinline
int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *wb), int max)
{
struct trans_logger_brick *brick = q->q_brick;
int total_len = 0;
bool found = false;
bool ok;
int res = 0;
@ -1861,6 +1923,8 @@ int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *
if (!wb)
goto done;
total_len += wb->w_len;
ok = startio(wb);
if (unlikely(!ok)) {
qq_wb_pushback(q, wb);
@ -1872,6 +1936,7 @@ int run_wb_queue(struct logger_queue *q, bool (*startio)(struct writeback_info *
done:
if (found) {
mars_limit(&global_writeback.limiter, (total_len - 1) / 1024 + 1);
wake_up_interruptible_all(&brick->worker_event);
}
return res;
@ -1890,58 +1955,166 @@ int _congested(struct trans_logger_brick *brick)
|| atomic_read(&brick->q_phase[3].q_flying);
}
static const
struct rank_info rank0[] = {
{ 0, 100 },
{ 100, 200 },
};
static const
struct rank_info rank1[] = {
{ 0, 10 },
{ 100, 20 },
};
static const
struct rank_info rank2[] = {
{ 0, 10 },
{ 100, 20 },
};
static const
struct rank_info rank3[] = {
{ 0, 10 },
{ 100, 20 },
};
/* In general, each individual ranking table may have a different length.
/* Ranking tables.
*/
static const
struct rank_info *ranks[LOGGER_QUEUES] = {
[0] = rank0,
[1] = rank1,
[2] = rank2,
[3] = rank3,
static
struct rank_info float_queue_rank_log[] = {
{ 0, 0 },
{ 1, 100 },
{ 10000, 100 },
{ RKI_DUMMY }
};
static const
int rank_counts[LOGGER_QUEUES] = {
[0] = sizeof(rank0) / sizeof(struct rank_info),
[1] = sizeof(rank1) / sizeof(struct rank_info),
[2] = sizeof(rank2) / sizeof(struct rank_info),
[3] = sizeof(rank3) / sizeof(struct rank_info),
static
struct rank_info float_queue_rank_io[] = {
{ 0, 0 },
{ 1, 1 },
{ 10000, 1 },
{ RKI_DUMMY }
};
static
struct rank_info float_fly_rank_log[] = {
{ 0, 0 },
{ 32, 10 },
{ 10000, 10 },
{ RKI_DUMMY }
};
static
struct rank_info float_fly_rank_io[] = {
{ 0, 0 },
{ 1, 10 },
{ 2, -20 },
{ 10000, -100 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_queue_rank_log[] = {
{ 0, 0 },
{ 1, 100 },
{ 100, 10 },
{ 10000, 10 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_queue_rank_io[] = {
{ 0, 0 },
{ 1, 10 },
{ 100, 100 },
{ 10000, 200 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_fly_rank_log[] = {
{ 0, 0 },
{ 1, 1 },
{ 32, 10 },
{ 10000, 1 },
{ RKI_DUMMY }
};
static
struct rank_info nofloat_fly_rank_io[] = {
{ 0, 0 },
{ 1, 10 },
{ 128, 8 },
{ 129, -100 },
{ 10000, -200 },
{ RKI_DUMMY }
};
static
struct rank_info *queue_ranks[2][LOGGER_QUEUES] = {
[0] = {
[0] = float_queue_rank_log,
[1] = float_queue_rank_io,
[2] = float_queue_rank_io,
[3] = float_queue_rank_io,
},
[1] = {
[0] = nofloat_queue_rank_log,
[1] = nofloat_queue_rank_io,
[2] = nofloat_queue_rank_io,
[3] = nofloat_queue_rank_io,
},
};
static
struct rank_info *fly_ranks[2][LOGGER_QUEUES] = {
[0] = {
[0] = float_fly_rank_log,
[1] = float_fly_rank_io,
[2] = float_fly_rank_io,
[3] = float_fly_rank_io,
},
[1] = {
[0] = nofloat_fly_rank_log,
[1] = nofloat_fly_rank_io,
[2] = nofloat_fly_rank_io,
[3] = nofloat_fly_rank_io,
},
};
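
ranking_compute() evaluates these tables as piecewise-linear functions between neighbouring sample points; for example, with nofloat_fly_rank_io:

/* at flying = 64 the segment (1,10)..(128,8) applies:
 *     y = 10 + (64 - 1) * (8 - 10) / (128 - 1)  ~  9 points
 * whereas flying = 129 already falls on (129,-100), and anything beyond
 * slides towards -200, sharply throttling a queue with more than 128
 * requests in flight. */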
static noinline
int _do_ranking(struct trans_logger_brick *brick, struct rank_data rkd[])
{
int i;
#ifdef DELAY_CALLERS
int floating_mode;
bool delay_callers;
#endif
ranking_start(rkd, LOGGER_QUEUES);
// check the memory situation...
delay_callers = false;
floating_mode = 1;
if (brick_global_memlimit >= 1024) {
struct rank_info full_punish_global[] = {
{ 0, 0 },
{ brick_global_memlimit * 3 / 4, 0 },
{ brick_global_memlimit, -1000 },
{ RKI_DUMMY }
};
int global_mem_used = atomic64_read(&global_mshadow_used) / 1024;
trans_logger_mem_usage = global_mem_used;
floating_mode = (global_mem_used < brick_global_memlimit / 2) ? 0 : 1;
if (global_mem_used >= brick_global_memlimit)
delay_callers = true;
MARS_IO("global_mem_used = %d\n", global_mem_used);
ranking_compute(&rkd[0], full_punish_global, global_mem_used);
} else if (brick->shadow_mem_limit >= 8) {
struct rank_info full_punish_local[] = {
{ 0, 0 },
{ brick->shadow_mem_limit * 3 / 4, 0 },
{ brick->shadow_mem_limit, -1000 },
{ RKI_DUMMY }
};
int local_mem_used = atomic64_read(&brick->shadow_mem_used) / 1024;
floating_mode = (local_mem_used < brick->shadow_mem_limit / 2) ? 0 : 1;
if (local_mem_used >= brick->shadow_mem_limit)
delay_callers = true;
MARS_IO("local_mem_used = %d\n", local_mem_used);
ranking_compute(&rkd[0], full_punish_local, local_mem_used);
}
if (delay_callers) {
if (!brick->delay_callers) {
brick->delay_callers = true;
atomic_inc(&brick->total_delay_count);
}
} else {
brick->delay_callers = false;
}
// obey the basic rules...
for (i = 0; i < LOGGER_QUEUES; i++) {
int queued = atomic_read(&brick->q_phase[i].q_queued);
@ -1949,62 +2122,44 @@ int _do_ranking(struct trans_logger_brick *brick, struct rank_data rkd[])
MARS_IO("i = %d queued = %d\n", i, queued);
/* This must come first.
* When a queue is empty, you must not credit any positive points.
* Otherwise, (almost) infinite selection of untreatable
* queues may occur.
*/
if (queued <= 0)
continue;
flying = atomic_read(&brick->q_phase[i].q_flying);
if (flying >= brick->q_phase[i].q_max_flying && brick->q_phase[i].q_max_flying > 0)
continue;
if (i == 1 && !floating_mode) {
int lim;
MARS_IO("i = %d queued = %d\n", i, queued);
ranking_compute(&rkd[i], ranks[i], rank_counts[i], queued);
if (atomic_read(&brick->q_phase[0].q_queued) + atomic_read(&brick->q_phase[0].q_flying) > 0) {
break;
}
// ... and the contention rule for queue 0 ...
if (i == 0 && brick->q_phase[0].q_max_flying > 0) {
struct rank_info contention[] = {
{ 0, 0 },
{ brick->q_phase[0].q_max_flying, 300 },
};
MARS_IO("flying = %d\n", flying);
ranking_compute(&rkd[0], contention, 2, flying);
if (elect_leader(&global_writeback) != brick) {
break;
}
if (banning_is_hit(&mars_global_ban)) {
break;
}
lim = mars_limit(&global_writeback.limiter, 0);
if (lim > 0) {
break;
}
}
ranking_compute(&rkd[i], queue_ranks[floating_mode][i], queued);
flying = atomic_read(&brick->q_phase[i].q_flying);
MARS_IO("i = %d queued = %d flying = %d\n", i, queued, flying);
ranking_compute(&rkd[i], fly_ranks[floating_mode][i], flying);
}
// ... and now the exceptions from the rules ...
#ifdef DELAY_CALLERS
delay_callers = false;
if (brick_global_memlimit >= 1024) {
struct rank_info full_punish_global[] = {
{ 0, 0 },
{ brick_global_memlimit * 7 / 8, 0 },
{ brick_global_memlimit, -1000 },
};
int global_mem_used = atomic64_read(&global_mshadow_used) / 1024;
trans_logger_mem_usage = global_mem_used;
if (global_mem_used >= brick_global_memlimit)
delay_callers = true;
MARS_IO("global_mem_used = %d\n", global_mem_used);
ranking_compute(&rkd[0], full_punish_global, 3, global_mem_used);
} else if (brick->shadow_mem_limit >= 8) {
struct rank_info full_punish_local[] = {
{ 0, 0 },
{ brick->shadow_mem_limit * 7 / 8, 0 },
{ brick->shadow_mem_limit, -1000 },
};
int local_mem_used = atomic64_read(&brick->shadow_mem_used) / 1024;
if (local_mem_used >= brick->shadow_mem_limit)
delay_callers = true;
MARS_IO("local_mem_used = %d\n", local_mem_used);
ranking_compute(&rkd[0], full_punish_local, 3, local_mem_used);
}
brick->delay_callers = delay_callers;
#endif
// finalize it
ranking_stop(rkd, LOGGER_QUEUES);
@ -2137,10 +2292,10 @@ void trans_logger_log(struct trans_logger_brick *brick)
switch (winner) {
case 0:
nr = run_mref_queue(&brick->q_phase[0], prep_phase_startio, brick->q_phase[0].q_batchlen);
nr = run_mref_queue(&brick->q_phase[0], prep_phase_startio, brick->q_phase[0].q_batchlen, true);
goto done;
case 1:
nr = run_mref_queue(&brick->q_phase[1], phase1_startio, brick->q_phase[1].q_batchlen);
nr = run_mref_queue(&brick->q_phase[1], phase1_startio, brick->q_phase[1].q_batchlen, true);
goto done;
case 2:
nr = run_wb_queue(&brick->q_phase[2], phase2_startio, brick->q_phase[2].q_batchlen);
@ -2711,6 +2866,7 @@ int trans_logger_brick_construct(struct trans_logger_brick *brick)
atomic_set(&brick->hash_count, 0);
spin_lock_init(&brick->replay_lock);
INIT_LIST_HEAD(&brick->replay_list);
INIT_LIST_HEAD(&brick->group_head);
init_waitqueue_head(&brick->worker_event);
init_waitqueue_head(&brick->caller_event);
qq_init(&brick->q_phase[0], brick);
@ -2732,6 +2888,20 @@ int trans_logger_brick_construct(struct trans_logger_brick *brick)
brick->new_input_nr = TL_INPUT_LOG1;
brick->log_input_nr = TL_INPUT_LOG1;
brick->old_input_nr = TL_INPUT_LOG1;
add_to_group(&global_writeback, brick);
return 0;
}
static noinline
int trans_logger_brick_destruct(struct trans_logger_brick *brick)
{
int i;
for (i = 0; i < TRANS_HASH_MAX; i++) {
struct hash_anchor *start = &brick->hash_table[i];
CHECK_HEAD_EMPTY(&start->hash_anchor);
}
CHECK_HEAD_EMPTY(&brick->replay_list);
remove_from_group(&global_writeback, brick);
return 0;
}
@ -2810,6 +2980,7 @@ const struct trans_logger_brick_type trans_logger_brick_type = {
.default_input_types = trans_logger_input_types,
.default_output_types = trans_logger_output_types,
.brick_construct = &trans_logger_brick_construct,
.brick_destruct = &trans_logger_brick_destruct,
};
EXPORT_SYMBOL_GPL(trans_logger_brick_type);

View File

@ -10,12 +10,27 @@
#include <linux/time.h>
#include "mars.h"
#include "lib_log.h"
#include "lib_pairing_heap.h"
#include "lib_queue.h"
///////////////////////// global tuning ////////////////////////
extern int trans_logger_mem_usage; // in KB
struct writeback_group {
rwlock_t lock;
struct trans_logger_brick *leader;
loff_t biggest;
struct list_head group_anchor;
// tuning
struct mars_limiter limiter;
int until_percent;
};
extern struct writeback_group global_writeback;
////////////////////////////////////////////////////////////////////
_PAIRING_HEAP_TYPEDEF(logger,)
@ -133,6 +148,7 @@ struct trans_logger_brick {
int old_input_nr; // where old IO requests may be on the fly
int replay_code; // replay errors (if any)
// private
struct list_head group_head;
loff_t old_margin;
spinlock_t replay_lock;
struct list_head replay_list;

View File

@ -120,7 +120,7 @@ struct mars_rotate {
// TUNING
int mars_mem_percent = 0;
int mars_mem_percent = 20;
EXPORT_SYMBOL_GPL(mars_mem_percent);
#define CONF_TRANS_SHADOW_LIMIT (1024 * 128) // don't fill the hashtable too much
@ -132,15 +132,13 @@ EXPORT_SYMBOL_GPL(mars_mem_percent);
//#define TRANS_FAKE
#define CONF_TRANS_BATCHLEN 64
#define CONF_TRANS_FLYING 256
#define CONF_TRANS_PRIO MARS_PRIO_HIGH
#define CONF_TRANS_LOG_READS false
//#define CONF_TRANS_LOG_READS true
//#define CONF_TRANS_COMPLETION_SEMANTICS 2
#define CONF_TRANS_COMPLETION_SEMANTICS 0
#define CONF_ALL_BATCHLEN 4
#define CONF_ALL_FLYING 32
#define CONF_ALL_BATCHLEN 1
#define CONF_ALL_PRIO MARS_PRIO_NORMAL
#define IF_SKIP_SYNC true
@ -186,11 +184,6 @@ int _set_trans_params(struct mars_brick *_brick, void *private)
trans_brick->q_phase[2].q_batchlen = CONF_ALL_BATCHLEN;
trans_brick->q_phase[3].q_batchlen = CONF_ALL_BATCHLEN;
trans_brick->q_phase[0].q_max_flying = CONF_TRANS_FLYING;
trans_brick->q_phase[1].q_max_flying = CONF_ALL_FLYING;
trans_brick->q_phase[2].q_max_flying = CONF_ALL_FLYING;
trans_brick->q_phase[3].q_max_flying = CONF_ALL_FLYING;
trans_brick->q_phase[0].q_io_prio = CONF_TRANS_PRIO;
trans_brick->q_phase[1].q_io_prio = CONF_ALL_PRIO;
trans_brick->q_phase[2].q_io_prio = CONF_ALL_PRIO;
@ -3857,7 +3850,7 @@ static int light_thread(void *data)
mars_mem_percent = 0;
if (mars_mem_percent > 70)
mars_mem_percent = 70;
brick_global_memlimit = brick_global_memavail * mars_mem_percent / 100;
brick_global_memlimit = (long long)brick_global_memavail * mars_mem_percent / 100;
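
The new (long long) cast guards against 32-bit overflow; a worked example, assuming brick_global_memavail is a 32-bit int counting KB:

/* On a 64 GiB machine, brick_global_memavail ~ 67108864 (KB).
 * 67108864 * 70 = 4697620480 > INT_MAX = 2147483647, so without the
 * widening cast the product would wrap before the division by 100. */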
brick_msleep(100);
@ -3918,6 +3911,8 @@ static int light_thread(void *data)
_show_statist(&_global);
#endif
MARS_DBG("ban_count = %d ban_renew_count = %d\n", mars_global_ban.ban_count, mars_global_ban.ban_renew_count);
brick_msleep(500);
wait_event_interruptible_timeout(_global.main_event, _global.main_trigger, CONFIG_MARS_SCAN_INTERVAL * HZ);

View File

@ -13,6 +13,8 @@
#include "strategy.h"
#include "mars_proc.h"
#include "../mars_bio.h"
#include "../mars_aio.h"
#include "../mars_client.h"
#include "../mars_server.h"
#include "../mars_trans_logger.h"
@ -161,6 +163,44 @@ EXPORT_SYMBOL_GPL(mars_max_loadavg);
#define _CTL_STRATEGY(handler) /*empty*/
#endif
#define INT_ENTRY(NAME,VAR,MODE) \
{ \
_CTL_NAME \
.procname = NAME, \
.data = &(VAR), \
.maxlen = sizeof(int), \
.mode = MODE, \
.proc_handler = &proc_dointvec, \
_CTL_STRATEGY(sysctl_intvec) \
}
#define LIMITER_ENTRIES(VAR, PREFIX, SUFFIX) \
INT_ENTRY(PREFIX "_limit_" SUFFIX, (VAR)->lim_max_rate, 0600), \
INT_ENTRY(PREFIX "_rate_" SUFFIX, (VAR)->lim_rate, 0600) \
#define THRESHOLD_ENTRIES(VAR, PREFIX) \
INT_ENTRY(PREFIX "_threshold_us", (VAR)->thr_limit, 0600), \
INT_ENTRY(PREFIX "_factor_percent", (VAR)->thr_factor, 0600), \
INT_ENTRY(PREFIX "_plus_us", (VAR)->thr_plus, 0600), \
INT_ENTRY(PREFIX "_triggered", (VAR)->thr_triggered, 0600), \
INT_ENTRY(PREFIX "_true_hit", (VAR)->thr_true_hit, 0600) \
static
ctl_table tuning_table[] = {
LIMITER_ENTRIES(&client_limiter, "network_traffic", "kb"),
LIMITER_ENTRIES(&server_limiter, "server_io", "kb"),
LIMITER_ENTRIES(&global_writeback.limiter, "writeback", "kb"),
INT_ENTRY("writeback_until_percent", global_writeback.until_percent, 0600),
THRESHOLD_ENTRIES(&bio_submit_threshold, "bio_submit"),
THRESHOLD_ENTRIES(&bio_io_threshold[0], "bio_io_r"),
THRESHOLD_ENTRIES(&bio_io_threshold[1], "bio_io_w"),
THRESHOLD_ENTRIES(&aio_submit_threshold, "aio_submit"),
THRESHOLD_ENTRIES(&aio_io_threshold[0], "aio_io_r"),
THRESHOLD_ENTRIES(&aio_io_threshold[1], "aio_io_w"),
THRESHOLD_ENTRIES(&aio_sync_threshold, "aio_sync"),
{}
};
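
For reference, the macros above expand to sysctl files like the following; the /proc/sys/mars root is assumed (it is not visible in this hunk) and the list is illustrative, not exhaustive:

/* /proc/sys/mars/tuning/writeback_limit_kb       <- LIMITER_ENTRIES, lim_max_rate
 * /proc/sys/mars/tuning/writeback_rate_kb        <- LIMITER_ENTRIES, lim_rate
 * /proc/sys/mars/tuning/writeback_until_percent  <- INT_ENTRY
 * /proc/sys/mars/tuning/bio_io_r_threshold_us    <- THRESHOLD_ENTRIES, thr_limit
 * /proc/sys/mars/tuning/aio_sync_factor_percent  <- THRESHOLD_ENTRIES, thr_factor
 * /proc/sys/mars/tuning/aio_submit_plus_us       <- THRESHOLD_ENTRIES, thr_plus
 */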
static
ctl_table mars_table[] = {
{
@ -181,106 +221,20 @@ ctl_table mars_table[] = {
.mode = 0400,
.proc_handler = &errors_sysctl_handler,
},
{
_CTL_NAME
.procname = "percent_mem_limit_kb",
.data = &mars_mem_percent,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "mem_used_kb",
.data = &trans_logger_mem_usage,
.maxlen = sizeof(int),
.mode = 0400,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "logrot_auto_gb",
.data = &global_logrot_auto,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "logdel_auto_gb",
.data = &global_logdel_auto,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "free_space_mb",
.data = &global_free_space,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
INT_ENTRY("percent_mem_limit_kb", mars_mem_percent, 0600),
INT_ENTRY("mem_used_kb", trans_logger_mem_usage, 0400),
INT_ENTRY("logrot_auto_gb", global_logrot_auto, 0600),
INT_ENTRY("logdel_auto_gb", global_logdel_auto, 0600),
INT_ENTRY("free_space_mb", global_free_space, 0600),
#ifdef CONFIG_MARS_LOADAVG_LIMIT
{
_CTL_NAME
.procname = "loadavg_limit",
.data = &mars_max_loadavg,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
INT_ENTRY("loadavg_limit", mars_max_loadavg, 0600),
#endif
INT_ENTRY("network_io_timeout", global_net_io_timeout, 0600),
{
_CTL_NAME
.procname = "network_io_timeout",
.data = &global_net_io_timeout,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "network_traffic_limit_kb",
.data = &client_limiter.lim_max_rate,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "network_traffic_rate_kb",
.data = &client_limiter.lim_rate,
.maxlen = sizeof(int),
.mode = 0400,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "server_io_limit_mb",
.data = &server_limiter.lim_max_rate,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
},
{
_CTL_NAME
.procname = "server_io_rate_mb",
.data = &server_limiter.lim_rate,
.maxlen = sizeof(int),
.mode = 0400,
.proc_handler = &proc_dointvec,
_CTL_STRATEGY(sysctl_intvec)
.procname = "tuning",
.mode = 0500,
.child = tuning_table,
},
{}
};