From 8f6054b0cc38f47941856bf868be59acdd60292f Mon Sep 17 00:00:00 2001
From: Thomas Schoebel-Theuer
Date: Thu, 18 Apr 2013 16:25:04 +0200
Subject: [PATCH] trans_logger: call _flush_inputs() more often

The old behaviour could lead to starvation effects.
In particular, direct IO writes were affected.
---
 kernel/mars_trans_logger.c | 96 +++++++++++++++++++++++++++++---------
 kernel/mars_trans_logger.h |  1 +
 kernel/sy_old/mars_proc.c  |  1 +
 3 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/kernel/mars_trans_logger.c b/kernel/mars_trans_logger.c
index fd73dca3..54f9e74c 100644
--- a/kernel/mars_trans_logger.c
+++ b/kernel/mars_trans_logger.c
@@ -77,6 +77,9 @@ EXPORT_SYMBOL_GPL(trans_logger_do_crc);
 int trans_logger_mem_usage; // in KB
 EXPORT_SYMBOL_GPL(trans_logger_mem_usage);
 
+int trans_logger_max_interleave = -1;
+EXPORT_SYMBOL_GPL(trans_logger_max_interleave);
+
 struct writeback_group global_writeback = {
 	.lock = __RW_LOCK_UNLOCKED(global_writeback.lock),
 	.group_anchor = LIST_HEAD_INIT(global_writeback.group_anchor),
@@ -2480,11 +2483,53 @@ void _exit_inputs(struct trans_logger_brick *brick, bool force)
 	}
 }
 
+/* Performance-critical:
+ * Calling log_flush() too often may result in
+ * increased overhead (and thus in lower throughput).
+ * Call it only when the IO scheduler need not do anything else.
+ * OTOH, calling it too seldom may hold back
+ * IO completion for the end user for too long time.
+ *
+ * Be careful to flush any leftovers in the log buffer, at least after
+ * some short delay.
+ *
+ * Description of flush_mode:
+ * 0 = flush unconditionally
+ * 1 = flush only when nothing can be appended to the transaction log
+ * 2 = see 1 && flush only when the user is waiting for an answer
+ * 3 = see 1 && not 2 && flush only when there is no other activity (background mode)
+ * Notice: 3 makes only sense for leftovers where the user is _not_ waiting for
+ */
+static inline
+void flush_inputs(struct trans_logger_brick *brick, int flush_mode)
+{
+	if (flush_mode < 1 ||
+	    // there is nothing to append any more
+	    (atomic_read(&brick->q_phase[0].q_queued) <= 0 &&
+	     // and the user is waiting for an answer
+	     (flush_mode < 2 ||
+	      atomic_read(&brick->log_fly_count) > 0 ||
+	      // else flush any leftovers in background, when there is no writeback activity
+	      (flush_mode == 3 &&
+	       atomic_read(&brick->q_phase[1].q_flying) + atomic_read(&brick->q_phase[3].q_flying) <= 0)))) {
+		MARS_IO("log_fly_count 0 %d q0 = %d q0 = %d q0 = %d q0 = %d\n",
+			atomic_read(&brick->log_fly_count),
+			atomic_read(&brick->q_phase[0].q_flying),
+			atomic_read(&brick->q_phase[1].q_flying),
+			atomic_read(&brick->q_phase[2].q_flying),
+			atomic_read(&brick->q_phase[3].q_flying)
+			);
+		_flush_inputs(brick);
+	}
+}
+
 static noinline
 void trans_logger_log(struct trans_logger_brick *brick)
 {
 	struct rank_data rkd[LOGGER_QUEUES] = {};
 	long long old_jiffies = jiffies;
+	long long work_jiffies = jiffies;
+	int interleave = 0;
 	int nr_flying;
 
 	brick->replay_code = 0; // indicates "running"
@@ -2499,10 +2544,19 @@ void trans_logger_log(struct trans_logger_brick *brick)
 
 		wait_event_interruptible_timeout(
 			brick->worker_event,
-			(winner = _do_ranking(brick, rkd)) >= 0,
-			1 * HZ);
-
-		MARS_IO("winner = %d\n", winner);
+			({
+				winner = _do_ranking(brick, rkd);
+				MARS_IO("winner = %d\n", winner);
+				if (winner < 0) { // no more work to do
+					int flush_mode = 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2);
+					flush_inputs(brick, flush_mode);
+					interleave = 0;
+				} else { // reset the timer whenever something is to do
+					work_jiffies = jiffies;
+				}
+				winner >= 0;
+			}),
+			HZ / 10);
 
 		atomic_inc(&brick->total_round_count);
 
@@ -2510,16 +2564,28 @@ void trans_logger_log(struct trans_logger_brick *brick)
 
 		switch (winner) {
 		case 0:
+			interleave = 0;
 			nr = run_mref_queue(&brick->q_phase[0], prep_phase_startio, brick->q_phase[0].q_batchlen, true);
 			goto done;
 		case 1:
+			if (interleave >= trans_logger_max_interleave && trans_logger_max_interleave >= 0) {
+				interleave = 0;
+				flush_inputs(brick, 3);
+			}
 			nr = run_mref_queue(&brick->q_phase[1], phase1_startio, brick->q_phase[1].q_batchlen, true);
+			interleave += nr;
 			goto done;
 		case 2:
+			interleave = 0;
 			nr = run_wb_queue(&brick->q_phase[2], phase2_startio, brick->q_phase[2].q_batchlen);
 			goto done;
 		case 3:
+			if (interleave >= trans_logger_max_interleave && trans_logger_max_interleave >= 0) {
+				interleave = 0;
+				flush_inputs(brick, 3);
+			}
 			nr = run_wb_queue(&brick->q_phase[3], phase3_startio, brick->q_phase[3].q_batchlen);
+			interleave += nr;
 		done:
 			if (unlikely(nr <= 0)) {
 				/* This should not happen!
@@ -2527,30 +2593,14 @@ void trans_logger_log(struct trans_logger_brick *brick)
 				 * algorithm cannot foresee anything.
 				 */
 				brick->q_phase[winner].no_progress_count++;
-				banning_hit(&brick->q_phase[winner].q_banning, 1000000);
+				banning_hit(&brick->q_phase[winner].q_banning, 10000);
+				flush_inputs(brick, 0);
 			}
 			ranking_select_done(rkd, winner, nr);
 			break;
 
 		default:
-			/* Performance-critical:
-			 * Calling log_flush() too often may result in
-			 * increased overhead (and thus in lower throughput).
-			 * Call it only when the IO scheduler need no do anything else.
-			 * OTOH, calling it too seldom may hold back
-			 * IO completion for the end user for too long time.
-			 * Be careful to flush any leftovers in the log buffer.
-			 */
-			if (
-				// there is nothing to append any more
-				atomic_read(&brick->q_phase[0].q_queued) <= 0 &&
-				// and the user is waiting for an answer
-				(atomic_read(&brick->log_fly_count) > 0 ||
-				 // else flush any leftovers in background, when there is no other activity
-				 (atomic_read(&brick->q_phase[0].q_flying) + atomic_read(&brick->q_phase[2].q_flying) > 0 &&
-				  atomic_read(&brick->q_phase[1].q_flying) + atomic_read(&brick->q_phase[3].q_flying) <= 0))) {
-				_flush_inputs(brick);
-			}
+			;
 		}
 
 		/* Update symlinks even during pauses.
diff --git a/kernel/mars_trans_logger.h b/kernel/mars_trans_logger.h
index e408e7d9..14d0118a 100644
--- a/kernel/mars_trans_logger.h
+++ b/kernel/mars_trans_logger.h
@@ -23,6 +23,7 @@ extern int trans_logger_completion_semantics;
 extern int trans_logger_do_crc;
 
 extern int trans_logger_mem_usage; // in KB
+extern int trans_logger_max_interleave;
 
 extern atomic_t global_mshadow_count;
 extern atomic64_t global_mshadow_used;
diff --git a/kernel/sy_old/mars_proc.c b/kernel/sy_old/mars_proc.c
index d126fd51..6d33c98a 100644
--- a/kernel/sy_old/mars_proc.c
+++ b/kernel/sy_old/mars_proc.c
@@ -213,6 +213,7 @@ ctl_table mars_table[] = {
 	INT_ENTRY("syslog_max_class",     brick_say_syslog_max,   0600),
 	INT_ENTRY("delay_say_on_overflow",delay_say_on_overflow,  0600),
 	INT_ENTRY("mapfree_period_sec",   mapfree_period_sec,     0600),
+	INT_ENTRY("logger_max_interleave", trans_logger_max_interleave, 0600),
 	INT_ENTRY("mem_limit_percent",    mars_mem_percent,       0600),
 	INT_ENTRY("logger_mem_used_kb",   trans_logger_mem_usage, 0400),
 	INT_ENTRY("mem_used_raw_kb",      brick_global_block_used,0400),
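
Reviewer note: the flush_mode heuristic introduced above can be sanity-checked in isolation. The sketch below is a minimal userspace model, not part of the patch, of the expression 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2) used in the new wait condition; the HZ value of 250 and the standalone main() are assumptions for illustration only. It shows the mode decaying from 2 (flush only when a user is waiting) through 1 (flush whenever nothing can be appended) to 0 and below (unconditional flush, since flush_inputs() treats every value below 1 alike) as the worker stays idle.

/* Hypothetical reviewer sketch, not part of the patch: models how the
 * flush_mode passed to flush_inputs() decays with idle time.
 * HZ = 250 is an assumed tick rate for this userspace model.
 */
#include <stdio.h>

#define HZ 250

/* Mirrors: int flush_mode = 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2); */
static int model_flush_mode(long jiffies, long work_jiffies)
{
	return 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2);
}

int main(void)
{
	long work_jiffies = 0;	/* last tick at which the ranking found work */
	long idle;

	/* under 2s idle  -> mode 2 (flush only when a user is waiting)
	 * 2s to under 4s -> mode 1 (flush whenever nothing can be appended)
	 * 4s and later   -> mode 0 or below (flush unconditionally)
	 */
	for (idle = 0; idle <= 5 * HZ; idle += HZ / 2)
		printf("idle=%.1fs flush_mode=%d\n",
		       (double)idle / HZ,
		       model_flush_mode(work_jiffies + idle, work_jiffies));
	return 0;
}

In the patch itself this value is only computed when _do_ranking() reports no work (winner < 0), and the shortened HZ/10 timeout merely bounds how long a re-evaluation of that idle branch can be delayed, so a busy logger does not pay for the extra flushes.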