From 8f6054b0cc38f47941856bf868be59acdd60292f Mon Sep 17 00:00:00 2001
From: Thomas Schoebel-Theuer
Date: Thu, 18 Apr 2013 16:25:04 +0200
Subject: [PATCH] trans_logger: call _flush_inputs() more often

The old behaviour could lead to starvation effects.
In particular, direct IO writes were affected.
---
 kernel/mars_trans_logger.c | 96 +++++++++++++++++++++++++++++---------
 kernel/mars_trans_logger.h |  1 +
 kernel/sy_old/mars_proc.c  |  1 +
 3 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/kernel/mars_trans_logger.c b/kernel/mars_trans_logger.c
index fd73dca3..54f9e74c 100644
--- a/kernel/mars_trans_logger.c
+++ b/kernel/mars_trans_logger.c
@@ -77,6 +77,9 @@ EXPORT_SYMBOL_GPL(trans_logger_do_crc);
 int trans_logger_mem_usage; // in KB
 EXPORT_SYMBOL_GPL(trans_logger_mem_usage);
 
+int trans_logger_max_interleave = -1;
+EXPORT_SYMBOL_GPL(trans_logger_max_interleave);
+
 struct writeback_group global_writeback = {
 	.lock = __RW_LOCK_UNLOCKED(global_writeback.lock),
 	.group_anchor = LIST_HEAD_INIT(global_writeback.group_anchor),
@@ -2480,11 +2483,53 @@ void _exit_inputs(struct trans_logger_brick *brick, bool force)
 	}
 }
 
+/* Performance-critical:
+ * Calling log_flush() too often may result in
+ * increased overhead (and thus in lower throughput).
+ * Call it only when the IO scheduler need not do anything else.
+ * OTOH, calling it too seldom may hold back
+ * IO completion for the end user for too long time.
+ *
+ * Be careful to flush any leftovers in the log buffer, at least after
+ * some short delay.
+ *
+ * Description of flush_mode:
+ * 0 = flush unconditionally
+ * 1 = flush only when nothing can be appended to the transaction log
+ * 2 = see 1 && flush only when the user is waiting for an answer
+ * 3 = see 1 && not 2 && flush only when there is no other activity (background mode)
+ * Notice: 3 makes only sense for leftovers where the user is _not_ waiting for
+ */
+static inline
+void flush_inputs(struct trans_logger_brick *brick, int flush_mode)
+{
+	if (flush_mode < 1 ||
+	    // there is nothing to append any more
+	    (atomic_read(&brick->q_phase[0].q_queued) <= 0 &&
+	     // and the user is waiting for an answer
+	     (flush_mode < 2 ||
+	      atomic_read(&brick->log_fly_count) > 0 ||
+	      // else flush any leftovers in background, when there is no writeback activity
+	      (flush_mode == 3 &&
+	       atomic_read(&brick->q_phase[1].q_flying) + atomic_read(&brick->q_phase[3].q_flying) <= 0)))) {
+		MARS_IO("log_fly_count 0 %d q0 = %d q0 = %d q0 = %d q0 = %d\n",
+			atomic_read(&brick->log_fly_count),
+			atomic_read(&brick->q_phase[0].q_flying),
+			atomic_read(&brick->q_phase[1].q_flying),
+			atomic_read(&brick->q_phase[2].q_flying),
+			atomic_read(&brick->q_phase[3].q_flying)
+			);
+		_flush_inputs(brick);
+	}
+}
+
 static noinline
 void trans_logger_log(struct trans_logger_brick *brick)
 {
 	struct rank_data rkd[LOGGER_QUEUES] = {};
 	long long old_jiffies = jiffies;
+	long long work_jiffies = jiffies;
+	int interleave = 0;
 	int nr_flying;
 
 	brick->replay_code = 0; // indicates "running"
@@ -2499,10 +2544,19 @@ void trans_logger_log(struct trans_logger_brick *brick)
 
 		wait_event_interruptible_timeout(
 			brick->worker_event,
-			(winner = _do_ranking(brick, rkd)) >= 0,
-			1 * HZ);
-
-		MARS_IO("winner = %d\n", winner);
+			({
+				winner = _do_ranking(brick, rkd);
+				MARS_IO("winner = %d\n", winner);
+				if (winner < 0) { // no more work to do
+					int flush_mode = 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2);
+					flush_inputs(brick, flush_mode);
+					interleave = 0;
+				} else { // reset the timer whenever something is to do
+					work_jiffies = jiffies;
+				}
+				winner >= 0;
+			}),
+			HZ / 10);
 
 		atomic_inc(&brick->total_round_count);
 
@@ -2510,16 +2564,28 @@ void trans_logger_log(struct trans_logger_brick *brick)
 
 		switch (winner) {
 		case 0:
+			interleave = 0;
 			nr = run_mref_queue(&brick->q_phase[0], prep_phase_startio, brick->q_phase[0].q_batchlen, true);
 			goto done;
 		case 1:
+			if (interleave >= trans_logger_max_interleave && trans_logger_max_interleave >= 0) {
+				interleave = 0;
+				flush_inputs(brick, 3);
+			}
 			nr = run_mref_queue(&brick->q_phase[1], phase1_startio, brick->q_phase[1].q_batchlen, true);
+			interleave += nr;
 			goto done;
 		case 2:
+			interleave = 0;
 			nr = run_wb_queue(&brick->q_phase[2], phase2_startio, brick->q_phase[2].q_batchlen);
 			goto done;
 		case 3:
+			if (interleave >= trans_logger_max_interleave && trans_logger_max_interleave >= 0) {
+				interleave = 0;
+				flush_inputs(brick, 3);
+			}
 			nr = run_wb_queue(&brick->q_phase[3], phase3_startio, brick->q_phase[3].q_batchlen);
+			interleave += nr;
 		done:
 			if (unlikely(nr <= 0)) {
 				/* This should not happen!
@@ -2527,30 +2593,14 @@ void trans_logger_log(struct trans_logger_brick *brick)
 				 * algorithm cannot foresee anything.
 				 */
 				brick->q_phase[winner].no_progress_count++;
-				banning_hit(&brick->q_phase[winner].q_banning, 1000000);
+				banning_hit(&brick->q_phase[winner].q_banning, 10000);
+				flush_inputs(brick, 0);
 			}
 			ranking_select_done(rkd, winner, nr);
 			break;
 
 		default:
-			/* Performance-critical:
-			 * Calling log_flush() too often may result in
-			 * increased overhead (and thus in lower throughput).
-			 * Call it only when the IO scheduler need no do anything else.
-			 * OTOH, calling it too seldom may hold back
-			 * IO completion for the end user for too long time.
-			 * Be careful to flush any leftovers in the log buffer.
-			 */
-			if (
-				// there is nothing to append any more
-				atomic_read(&brick->q_phase[0].q_queued) <= 0 &&
-				// and the user is waiting for an answer
-				(atomic_read(&brick->log_fly_count) > 0 ||
-				 // else flush any leftovers in background, when there is no other activity
-				 (atomic_read(&brick->q_phase[0].q_flying) + atomic_read(&brick->q_phase[2].q_flying) > 0 &&
-				  atomic_read(&brick->q_phase[1].q_flying) + atomic_read(&brick->q_phase[3].q_flying) <= 0))) {
-				_flush_inputs(brick);
-			}
+			;
 		}
 
 		/* Update symlinks even during pauses.
diff --git a/kernel/mars_trans_logger.h b/kernel/mars_trans_logger.h
index e408e7d9..14d0118a 100644
--- a/kernel/mars_trans_logger.h
+++ b/kernel/mars_trans_logger.h
@@ -23,6 +23,7 @@ extern int trans_logger_completion_semantics;
 extern int trans_logger_do_crc;
 
 extern int trans_logger_mem_usage; // in KB
+extern int trans_logger_max_interleave;
 
 extern atomic_t global_mshadow_count;
 extern atomic64_t global_mshadow_used;
diff --git a/kernel/sy_old/mars_proc.c b/kernel/sy_old/mars_proc.c
index d126fd51..6d33c98a 100644
--- a/kernel/sy_old/mars_proc.c
+++ b/kernel/sy_old/mars_proc.c
@@ -213,6 +213,7 @@ ctl_table mars_table[] = {
 	INT_ENTRY("syslog_max_class",     brick_say_syslog_max,   0600),
 	INT_ENTRY("delay_say_on_overflow",delay_say_on_overflow,  0600),
 	INT_ENTRY("mapfree_period_sec",   mapfree_period_sec,     0600),
+	INT_ENTRY("logger_max_interleave", trans_logger_max_interleave, 0600),
 	INT_ENTRY("mem_limit_percent",    mars_mem_percent,       0600),
 	INT_ENTRY("logger_mem_used_kb",   trans_logger_mem_usage, 0400),
 	INT_ENTRY("mem_used_raw_kb",      brick_global_block_used,0400),
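
Reviewer note: the flush_mode heuristic introduced above can be sanity-checked in isolation. The sketch below is a minimal userspace model, not part of the patch, of the expression 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2) used in the new wait condition; the HZ value of 250 and the standalone main() are assumptions for illustration only. It shows the mode decaying from 2 (flush only when a user is waiting) through 1 (flush whenever nothing can be appended) to 0 and below (unconditional flush, since flush_inputs() treats every value below 1 alike) as the worker stays idle.

/* Hypothetical reviewer sketch, not part of the patch: models how the
 * flush_mode passed to flush_inputs() decays with idle time.
 * HZ = 250 is an assumed tick rate for this userspace model.
 */
#include <stdio.h>

#define HZ 250

/* Mirrors: int flush_mode = 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2); */
static int model_flush_mode(long jiffies, long work_jiffies)
{
	return 2 - ((int)(jiffies - work_jiffies)) / (HZ * 2);
}

int main(void)
{
	long work_jiffies = 0;	/* last tick at which the ranking found work */
	long idle;

	/* under 2s idle  -> mode 2 (flush only when a user is waiting)
	 * 2s to under 4s -> mode 1 (flush whenever nothing can be appended)
	 * 4s and later   -> mode 0 or below (flush unconditionally)
	 */
	for (idle = 0; idle <= 5 * HZ; idle += HZ / 2)
		printf("idle=%.1fs flush_mode=%d\n",
		       (double)idle / HZ,
		       model_flush_mode(work_jiffies + idle, work_jiffies));
	return 0;
}

In the patch itself this value is only computed when _do_ranking() reports no work (winner < 0), and the shortened HZ/10 timeout merely bounds how long a re-evaluation of that idle branch can be delayed, so a busy logger does not pay for the extra flushes.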