/*
 * MARS Long Distance Replication Software
 *
 * This file is part of MARS project: http://schoebel.github.io/mars/
 *
 * Copyright (C) 2010-2014 Thomas Schoebel-Theuer
 * Copyright (C) 2011-2014 1&1 Internet AG
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */


/* Qio brick = adaptor between the MARS brick infrastructure
 * and the Linux-internal kernel interfaces.
 *
 * The low-level part is in lib_qio_*.[ch]
 *
 * Uses Queued IO, replacing the historic aio brick type.
 * Now everything lives in kernel space, avoiding __user pointers.
 *
 * MARS needs non-aligned IO supporting byte addresses in
 * persistent /mars. Just working on page offsets is not sufficient,
 * for strategic reasons which are off topic here.
 *
 * Via iterators, we sit rather close on top of the page cache.
 */
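
/* Illustration (not part of the build logic, numbers made up): a
 * byte-granular request arrives as an mref like
 *
 *   mref->ref_pos   = 4097;  // arbitrary byte offset, no page alignment
 *   mref->ref_len   = 123;   // arbitrary byte count
 *   mref->ref_flags = 0;     // MREF_WRITE would request a write instead
 *
 * and travels through qio_ref_get() / qio_ref_io() below.
 */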

//#define BRICK_DEBUGGING
//#define MARS_DEBUGGING
//#define IO_DEBUGGING
//#define STAT_DEBUGGING

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

#include "brick_wait.h"
#include "mars.h"

#ifdef ENABLE_MARS_QIO

/* ///////////////////////// own type definitions //////////////////////// */

#include "mars_qio.h"

/* ///////////////////// thresholds and timing reports //////////////////// */

/* 0 = infinite
 * >0 = expire hanging requests after this many seconds.
 */
int mars_qio_hang_timeout_s = 30;

/* 0 = expire requests individually, one by one.
 *     Certain hardware may create holes due to this.
 * 1 = try to work around hardware problems (e.g. holes), but no guarantee.
 *     Ask the hardware vendor.
 */
int mars_qio_workaround_hw_problems = 0;

/* We only need the threshold checks for overall IO latency.
 * This is useful for
 * (a) monitoring of any hardware problems,
 *     or overloads by queueing, etc.
 *     A simple use case is very slow (old) hardware of which
 *     some people believe that it could work miracles.
 *     This provides an information source, e.g. for sysadmin
 *     reports to the management.
 * (b) global banning, protecting against _mass overload_
 *     caused by too many IO requests fired onto
 *     /dev/mars/$too_many_resources or similar.
 *     Typically, this happens on _unplanned_ DDOS attacks,
 *     and some other horror scenarios.
 *     All I can do is give you some hints; the rest
 *     needs to be done by senior sysadmins.
 *
 * Separate latency checks for the submission path could be added,
 * but are left out for now.
 *
 * Anyone misconfiguring a low-level latency (e.g. via
 * nr_requests parameters and/or some IO scheduler properties)
 * will have to check any effects on her own.
 */
struct threshold qio_io_threshold[2] = {
	[0] = {
		.thr_ban = &mars_global_ban,
		.thr_parent = &global_io_threshold,
		.thr_limit = QIO_IO_R_MAX_LATENCY,
		.thr_factor = 100,
		.thr_plus = 0,
	},
	[1] = {
		.thr_ban = &mars_global_ban,
		.thr_parent = &global_io_threshold,
		.thr_limit = QIO_IO_W_MAX_LATENCY,
		.thr_factor = 100,
		.thr_plus = 0,
	},
};

#define MAX_REQUEUE 32

#define QIO_MAX_JIFFIES 2
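
/* Usage sketch (mirrors the completion path in _qio_complete_mref_a()
 * further below; repeated here only to document the table):
 *
 *   latency_ns = lamport_time_to_ns(&latency);
 *   threshold_check(&qio_io_threshold[was_write], latency_ns);
 *
 * Index [0] covers reads, index [1] covers writes; .thr_ban points at
 * mars_global_ban, tying exceeded limits to the global banning
 * described in (b) above.
 */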

/* ///////////////////// lowlevel interface to kernel //////////////////// */

/* Intended for long-term maintenance, e.g. adaptations to
 * better locking methods etc.
 */
static inline
void qio_lock(struct qio_anchors *anch)
{
	mutex_lock(&anch->prio_mutex);
}

static inline
void qio_unlock(struct qio_anchors *anch)
{
	mutex_unlock(&anch->prio_mutex);
}
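
/* Hypothetical sketch (not implemented): thanks to these wrappers, a
 * future switch to another locking method would only touch two
 * functions, e.g.
 *
 *   static inline void qio_lock(struct qio_anchors *anch)
 *   { spin_lock(&anch->prio_spinlock); }
 *
 * where prio_spinlock is a made-up member name. Today the real lock is
 * anch->prio_mutex, and all callers would stay unchanged.
 */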

static inline
void _enqueue_mref_a(struct qio_anchors *anch,
		     struct qio_mref_aspect *mref_a)
{
	qio_lock(anch);
	list_del(&mref_a->io_head);
	list_add_tail(&mref_a->io_head, &anch->submitted_list);
	qio_unlock(anch);
}

static inline
struct qio_mref_aspect *_dequeue_mref_a(struct qio_anchors *anch)
{
	struct qio_mref_aspect *mref_a = NULL;

	qio_lock(anch);
	if (!list_empty(&anch->submitted_list)) {
		struct list_head *tmp = anch->submitted_list.next;

		list_del_init(tmp);
		mref_a = container_of(tmp, struct qio_mref_aspect, io_head);
	}
	qio_unlock(anch);

	return mref_a;
}

static inline
void _check_mref_a(struct qio_anchors *anch,
		   struct qio_mref_aspect *mref_a,
		   int err)
{
	qio_lock(anch);
	if (unlikely(READ_ONCE(mref_a->has_expired))) {
		MARS_DBG("QIO %p had expired, err=%d\n",
			 mref_a, err);
	} else if (unlikely(!list_empty(&mref_a->io_head))) {
		list_del_init(&mref_a->io_head);
	}
	qio_unlock(anch);
}

static
int __qio_wait_for_kernel(struct qio_brick *brick, struct qio_mref_aspect *mref_a)
{
	struct file *file = brick->file;
	struct qio_anchors *anch = mref_a->anch;
	int err;

	err =
		(qio_rw_operations.qio_rw_wait)(file,
						mref_a->is_write,
						mref_a->use_nowait,
						&mref_a->qio_rw);

	_check_mref_a(anch, mref_a, err);

	return err;
}

static
int __qio_submit_to_kernel(struct qio_brick *brick,
			   struct qio_mref_aspect *mref_a,
			   bool nowait)
{
	struct mref_object *mref = mref_a->object;
	struct file *file = brick->file;
	bool is_write;
	int status = -EBADR;

#ifdef CONFIG_MARS_DEBUG
	CHECK_PTR(brick, done);
	CHECK_PTR(mref_a, done);
	CHECK_PTR(mref, done);
	CHECK_PTR(mref->ref_data, done);
	if (unlikely(mref->ref_len <= 0)) {
		MARS_ERR("invalid ref_len=%d at ref_pos=%lld is_active=%d is_write=%d\n",
			 mref->ref_len, mref->ref_pos,
			 READ_ONCE(mref_a->is_active),
			 READ_ONCE(mref_a->is_write));
	}
#endif

	if (unlikely(READ_ONCE(mref_a->is_active))) {
		status = -EINVAL;
		goto done;
	}
	is_write = mref_a->is_write;
	WRITE_ONCE(mref_a->is_active, true);
	if (is_write) {
		status =
			(qio_rw_operations.qio_write_start)(
				file,
				mref->ref_data,
				mref->ref_len,
				&mref->ref_pos,
				&mref_a->qio_rw,
				!nowait);
	} else {
		status =
			(qio_rw_operations.qio_read_start)(
				file,
				mref->ref_data,
				mref->ref_len,
				&mref->ref_pos,
				&mref_a->qio_rw,
				nowait);
	}
 done:
	return status;
}
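
/* Note on the call protocol (descriptive only): qio_rw_operations is
 * the vtable of the low-level part (lib_qio_*.[ch], see the header
 * comment). A full round trip, as used by the submission and worker
 * paths below, looks like:
 *
 *   err = __qio_submit_to_kernel(brick, mref_a, nowait);
 *   ...
 *   err = __qio_wait_for_kernel(brick, mref_a);
 */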

/* ///////////////////////// own helper functions //////////////////////// */

static
loff_t get_total_size(struct qio_brick *brick)
{
	loff_t known_pos;

	if (unlikely(!brick->mf))
		return 0;

	/* Workaround for races in the page cache.
	 * Concurrent reads and writes appear to result in
	 * inconsistent reads in some very rare cases, due to races.
	 * Sometimes the inode claims that the file has already been
	 * appended by a write operation, but the data has not actually hit
	 * the page cache, such that a concurrent read gets NULL blocks.
	 * AFAICS this type of race is not avoidable, whatever I try to do.
	 * It looks like a dilemma or trilemma.
	 * The i_size update cannot be fully atomic with respect to
	 * the semantics of the underlying disk / hardware behaviour.
	 * MARS tries to do its best to compensate for power outages
	 * occurring at any imaginable point in time.
	 * However, I cannot work miracles. I just try to give my best.
	 */
	known_pos = mf_dirty_length(brick->mf, DIRTY_COMPLETING);
	return known_pos;
}
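
/* Illustrative timeline of the race described above (assumed scenario):
 * a writer appends [P, P+len) and i_size already reports P+len, but the
 * new page content is not yet visible; a concurrent reader of that range
 * then sees zeroed blocks. Reporting mf_dirty_length(mf, DIRTY_COMPLETING)
 * instead of i_size keeps readers behind the last completing write.
 */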

static
void cross_eof(struct qio_brick *brick,
	       struct qio_mref_aspect *mref_a,
	       struct mref_object *mref)
{
	int ref_len;
	loff_t end_pos;
	loff_t eof_pos;
	loff_t usable_len;

	ref_len = mref->ref_len;
	if (unlikely(ref_len <= 0)) {
		/* Nothing to do here */
		return;
	}
	end_pos = mref->ref_pos + (loff_t)ref_len;
	/* Re-get the current size.
	 */
	eof_pos = get_total_size(brick);
	mref->ref_total_size = eof_pos;

	if (end_pos <= eof_pos)
		return;

	/* Shorten reads crossing EOF,
	 * but not when starting exactly at EOF,
	 * for pipe-like semantics.
	 */
	usable_len = eof_pos - mref->ref_pos;
	if (usable_len > 0 && usable_len < INT_MAX && mref->ref_len > usable_len)
		mref->ref_len = usable_len;
}
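
/* Worked example (made-up numbers): with eof_pos = 1000,
 *   ref_pos = 900,  ref_len = 200  =>  shortened to ref_len = 100;
 *   ref_pos = 1000, ref_len = 100  =>  usable_len == 0, the request
 *     keeps its full length (pipe-like semantics at exactly EOF).
 */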

static
int _qio_ref_get(struct qio_brick *brick, struct qio_mref_aspect *mref_a)
{
	struct mref_object *mref = mref_a->object;
	loff_t total_size;
	int len;
	bool may_write;

	if (unlikely(mref->ref_len <= 0)) {
		MARS_ERR("bad ref_len=%d\n", mref->ref_len);
		return -EILSEQ;
	}
	if (unlikely(!brick->mf))
		return -ENOENT;

	total_size = get_total_size(brick);
	if (unlikely(total_size < 0)) {
		return total_size;
	}
	mref->ref_total_size = total_size;

	if (mref->ref_initialized) {
		_mref_get(mref);
		return mref->ref_len;
	}

	/* Check for reads crossing the EOF boundary (special case)
	 */
	may_write = (mref->ref_flags & (MREF_WRITE | MREF_MAY_WRITE)) != 0;
	if (!may_write) {
		cross_eof(brick, mref_a, mref);
		if (unlikely(mref->ref_len <= 0)) {
			/* Exactly on EOF */
			return 0;
		}
	}

	/* Buffered IO.
	 */
	len = mref->ref_len;
	if (!mref->ref_data) {
		mref_a->alloc_len = len;
		mref_a->do_dealloc = true;
		mref->ref_data =
			brick_block_alloc(mref->ref_pos, len);
	}

	_mref_get_first(mref);
	return len;
}

static
void _qio_ref_put(struct qio_brick *brick, struct qio_mref_aspect *mref_a)
{
	struct mref_object *mref = mref_a->object;

#ifdef CONFIG_MARS_DEBUG
	CHECK_PTR(mref_a, done);
	CHECK_PTR(mref, done);
#endif
	if (!_mref_put(mref))
		goto done;

	if (brick->mf) {
		struct file *file = brick->mf->mf_filp;

#ifdef CONFIG_MARS_DEBUG
		CHECK_PTR(file, done);
#endif
		if (file->f_mapping && file->f_mapping->host) {
			mref->ref_total_size = get_total_size(brick);
		}
	}

	if (mref_a->do_dealloc) {
		brick_block_free(mref->ref_data, mref_a->alloc_len);
		mref->ref_data = NULL;
	}
	qio_free_mref(mref);

 done:
	return;
}

static
void _qio_complete_mref_a(struct qio_brick *brick, struct qio_mref_aspect *mref_a, int err)
{
	struct mref_object *mref;
	struct lamport_time latency;
	long long latency_ns;
	bool was_write;

	mref = mref_a->object;
	CHECK_PTR(mref, fatal);

	if (unlikely(err < 0)) {
		struct lamport_time duration;
		struct qio_anchors *anch;
		bool list_member = false;

		anch = mref_a->anch;
		if (anch) {
			qio_lock(anch);
			if (unlikely(!list_empty(&mref_a->io_head))) {
				list_del_init(&mref_a->io_head);
				list_member = true;
			}
			qio_unlock(anch);
		}
		if (err == -EAGAIN)
			goto no_report;

		duration = lamport_time_sub(mref_a->completion_stamp,
					    mref_a->started_stamp);

		MARS_ERR("QIO error %d memb=%d duration=%lld.%09lu at pos=%lld len=%d (mref=%p ref_data=%p)\n",
			 err, list_member,
			 duration.tv_sec, duration.tv_nsec,
			 mref->ref_pos, mref->ref_len, mref, mref->ref_data);
	} else {
		mref_checksum(mref);
		mref->ref_flags |= MREF_UPTODATE;
	}

 no_report:
#ifdef CONFIG_MARS_DEBUG
	CHECK_PTR_NULL(brick->mf, skip_mf);
#endif
	was_write = mref_a->is_write;
	if (was_write && err >= 0) {
		mf_dirty_append(brick->mf, DIRTY_COMPLETING,
				mref->ref_pos + mref->ref_len);
	}
	mapfree_set(brick->mf,
		    mref->ref_pos,
		    mref->ref_pos + mref->ref_len);
	if (was_write && err >= 0) {
		/* Needs to be done before the callback, which might
		 * modify any mref fields.
		 */
		mf_dirty_append(brick->mf, DIRTY_FINISHED,
				mref->ref_pos + mref->ref_len);
	}

#ifdef CONFIG_MARS_DEBUG
 skip_mf:
#endif
	latency = lamport_time_sub(mref_a->dequeue_stamp, mref_a->started_stamp);
	latency_ns = lamport_time_to_ns(&latency);
	threshold_check(&qio_io_threshold[was_write], latency_ns);

#ifdef CONFIG_MARS_DEBUG
	brick->complet_pos = mref->ref_pos;
	brick->complet_len = mref->ref_len;
#endif
	if (err < 0)
		brick->error = err;

	CHECKED_CALLBACK(mref, err, callback_err);

 done:
	if (was_write) {
		atomic_dec(&brick->flying_writes);
	} else {
		atomic_dec(&brick->flying_reads);
	}
	_qio_ref_put(brick, mref_a);
	atomic_dec(&mars_global_io_flying);
	return;

 callback_err:
	MARS_ERR("QIO %p callback err=%d\n",
		 brick, err);
	goto done;
 fatal:
	MARS_FAT("bad mref pointer %p, giving up...\n",
		 mref);
}

static
bool now_expired(struct qio_mref_aspect *mref_a,
		 struct lamport_time *last_hanging_stamp,
		 struct lamport_time *now)
{
	int touched_s;
	int latency_s;

	if (!mars_qio_hang_timeout_s)
		return false;

	touched_s = mref_a->dequeue_stamp.tv_sec;
	if (!touched_s)
		touched_s = mref_a->started_stamp.tv_sec;

	latency_s =
		last_hanging_stamp->tv_sec - touched_s;
	if (latency_s > mars_qio_hang_timeout_s)
		return true;

	latency_s =
		now->tv_sec - touched_s;
	if (latency_s > mars_qio_hang_timeout_s * 2)
		return true;

	return false;
}
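
/* Example with the default mars_qio_hang_timeout_s = 30: a request is
 * treated as expired when the last recorded hang stamp lies more than
 * 30 s after the request was last touched, or unconditionally when the
 * current wall clock lies more than 60 s after that point.
 */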

static
void _qio_complete_all_hanging(struct qio_anchors *anch,
			       struct lamport_time *hanging_stamp,
			       int err)
{
	struct lamport_time now;
	struct qio_brick *brick = anch->brick;
	struct list_head *io_list = &anch->submitted_list;
	struct list_head *tmp;

	get_real_lamport(&now);

 restart:
	barrier();
	qio_lock(anch);
	/* Keep the original order, as best as possible */
	tmp = io_list->next;
	while (tmp != io_list) {
		struct list_head *old_tmp;
		struct qio_mref_aspect *mref_a =
			container_of(tmp, struct qio_mref_aspect, io_head);

		/* check whether the request is old enough for failing */
		if (!now_expired(mref_a, hanging_stamp, &now)) {
			/* Not (yet) expired.
			 */
			tmp = tmp->next;
			brick_yield();
			continue;
		}
		/* Now expired.
		 */
		old_tmp = tmp;
		if (mars_qio_workaround_hw_problems == 1) {
			/* Do not give up the lock.
			 * This locks out concurrent submitters.
			 * We are trying to survive, not to win
			 * any benchmarks.
			 * Although this is a CPU burner, we want
			 * to do our best by avoiding inode updates
			 * due to "successful" callbacks (whether the
			 * physical completion is strict, or maybe
			 * corresponds to whatever).
			 * Advanced sysadmins may not only activate
			 * this workaround when appropriate, but may
			 * also abort this crude operational emergency via
			 * "echo 0 > /proc/sys/mars/..."
			 */
			tmp = tmp->next;
			list_del_init(old_tmp);
			_qio_complete_mref_a(brick, mref_a, err);
			brick_yield();
			continue;
		}

		/* _We_ have won the race, so any parallel or successive
		 * interrupts (resp. callbacks) should not believe
		 * they had won.
		 */
		WRITE_ONCE(mref_a->has_expired, true);
		tmp = tmp->next;
		list_del_init(old_tmp);
		qio_unlock(anch);

		_qio_complete_mref_a(brick, mref_a, err);

		/* Well, repeat with the next candidate.
		 * IMPORTANT: the completion order may be non-ordered
		 * by hardware properties.
		 * We cannot guarantee that no holes may occur.
		 * Sorry, but operational continuity takes precedence here.
		 * You may increase mars_qio_hang_timeout_s to "infinity"
		 * if you dislike the hardware behaviour.
		 */
		brick_yield();
		get_real_lamport(&now);
		goto restart;
	}
	qio_unlock(anch);
}

static
void qio_complete_all_hanging(struct qio_anchors *anch, int err)
{
	struct qio_brick *brick = anch->brick;

	_qio_complete_all_hanging(anch, &brick->last_hanging_stamp, err);
}

static
void _qio_ref_io(struct qio_brick *brick, struct qio_mref_aspect *mref_a)
{
	struct mref_object *mref = mref_a->object;
	struct qio_anchors *anch;
	struct lamport_time now;
	bool is_write;
	int err = -EINVAL;

	get_real_lamport(&now);
	mref_a->started_stamp = now;
	brick->last_started_stamp = now;

	is_write = (mref->ref_flags & MREF_WRITE);
	mref_a->is_write = is_write;
	if (is_write)
		mref_a->use_nowait = true;

	/* statistics */
	atomic_inc(&mars_global_io_flying);
	if (is_write) {
		atomic_inc(&brick->flying_writes);
	} else {
		atomic_inc(&brick->flying_reads);
	}

	if (unlikely(!brick->mf || !brick->mf->mf_filp)) {
		goto fail_fast;
	}

	mapfree_set(brick->mf, mref->ref_pos, -1);

	/* Check for reads crossing the EOF boundary (special case)
	 */
	if (!is_write)
		cross_eof(brick, mref_a, mref);

	/* determine the right queue, depending on read/write */
	anch = &brick->thread_anch[is_write & 1];
	mref_a->anch = anch;

#ifdef CONFIG_MARS_DEBUG
	brick->submit_pos = mref->ref_pos;
	brick->submit_len = mref->ref_len;
#endif

	/* Hopefully, the kernel-side submission path
	 * will be fast.
	 * There might be exceptions, such as delays from
	 * nr_requests and siblings.
	 * TODO: react in a more sophisticated way to such cases.
	 */
	err = __qio_submit_to_kernel(brick, mref_a, mref_a->use_nowait);
	if (unlikely(err < 0)) {
		/* Delegate to the callback thread:
		 * (a) Do not hinder parallelism of submission
		 *     by a sequential bottleneck.
		 * (b) EAGAIN may occur, and it may require
		 *     a wait queue anyway.
		 * (c) Prevent resource deadlocks.
		 *     Our QIO queues may grow higher than
		 *     other queues, like TCP window queues, etc.
		 */
		mref_a->qio_error = err;
	}

	/* Submission has been handed over (errors are delegated above).
	 * Assume that the hardware is operational.
	 */
	brick->last_hanging_stamp = now;

	/* Do not expose to worker threads earlier.
	 */
	_enqueue_mref_a(anch, mref_a);

	WRITE_ONCE(anch->should_wake_now, true);
	brick_wake_smp(&anch->event);

 done:
	return;

	/* Not yet enqueued.
	 * Complete directly by reporting the fatal error.
	 * Do not use in masses.
	 */
 fail_fast:
	_qio_complete_mref_a(brick, mref_a, err);
	goto done;
}

/* ////////////////// qio thread thingies ////////////////// */

static inline
bool _qio_thread_should_run(struct qio_anchors *anch,
			    struct qio_brick *brick)
{
	return
		READ_ONCE(anch->should_wake_now) ||
		!READ_ONCE(anch->should_terminate) ||
		atomic_read(&brick->flying_reads) +
		atomic_read(&brick->flying_writes) > 0;
}

static
void check_hanging_requests(struct qio_anchors *anch,
			    struct qio_brick *brick)
{
	int latency_s;

	if (!mars_qio_hang_timeout_s ||
	    !brick->last_completion_stamp.tv_sec)
		return;

	latency_s =
		brick->last_completion_stamp.tv_sec -
		brick->last_started_stamp.tv_sec;
	if (latency_s <= mars_qio_hang_timeout_s)
		return;

	/* check once again, realtime */
	if (!brick->last_hanging_stamp.tv_sec)
		get_real_lamport(&brick->last_hanging_stamp);

	qio_complete_all_hanging(anch, -ETIME);
}

static
int qio_thread(void *private)
{
	struct qio_anchors *anch = private;
	struct qio_brick *brick = anch->brick;
	unsigned long sleep_jiffies = 0;
	unsigned long last_jiffies = 0;

	MARS_DBG("QIO thread %d has started on '%s'\n",
		 anch->anch_prio, brick->brick_path);

	while (_qio_thread_should_run(anch, brick)) {
		struct qio_mref_aspect *mref_a;
		struct mref_object *mref;
		struct lamport_time now;
		int err;

		mref_a = _dequeue_mref_a(anch);
		if (!mref_a) {
			bool got_wake_signal;

			if (!brick->power.button) {
				check_hanging_requests(anch, brick);
			}
			got_wake_signal = false;
			brick_wait_smp(
				anch->event,
				(got_wake_signal = READ_ONCE(anch->should_wake_now)) ||
				!_qio_thread_should_run(anch, brick),
				sleep_jiffies);
			WRITE_ONCE(anch->should_wake_now, false);

			/* Fast path, controlled via flag.
			 * See the lectures from Dijkstra.
			 */
			if (got_wake_signal) {
				cond_resched();
				continue;
			}

			/* The page cache is very fast, in terms of reaction delay.
			 * Our first few slow-path rounds should work as a polling
			 * loop.
			 * After a while, we should pause for at least one jiffy,
			 * in order to avoid a CPU burner.
			 * Much more than one jiffy might lead to unnecessary delays.
			 * So we try some compromise, as explained by Dijkstra
			 * and his grandsons in spirit.
			 * When in doubt: refer to the Dekker algorithm, and measure
			 * its performance upon massive contention.
			 */
			if (sleep_jiffies < QIO_MAX_JIFFIES) {
				unsigned long now_jiffies = jiffies;

				if (!last_jiffies)
					last_jiffies = now_jiffies;
				else if (now_jiffies > last_jiffies)
					sleep_jiffies++;
				cond_resched();
			}
			continue;
		}
		last_jiffies = jiffies;
		sleep_jiffies = 0;

		get_real_lamport(&now);
		mref_a->dequeue_stamp = now;
#ifdef CONFIG_MARS_DEBUG
		brick->last_dequeue_stamp = now;
#endif

		mref = mref_a->object;
		mapfree_set(brick->mf, mref->ref_pos, -1);

		err = mref_a->qio_error;
		if (err == -EAGAIN &&
		    !READ_ONCE(anch->should_terminate)) {
			/* Re-submit again.
			 * This may happen regularly, since the
			 * page cache requires us to poll,
			 * at least when using nowait for achieving
			 * high IO parallelism.
			 */
			WRITE_ONCE(mref_a->is_active, false);
			mref_a->qio_rw.qio_phase = 0;
			err = __qio_submit_to_kernel(brick, mref_a, false);
			mref_a->qio_error = err;

			/* Is polling now over, or should we retry? */
			if (err == -EAGAIN &&
			    mref_a->nr_requeue++ <= MAX_REQUEUE) {
				/* Re-enqueue for a limited number of
				 * rounds.
				 */
				_enqueue_mref_a(anch, mref_a);
				brick_yield();
				continue;
			}
		}
		if (unlikely(err < 0)) {
			/* We need to give up directly.
			 * This one never hit the page cache, AFAICS.
			 */
			goto do_complete;
		}

		err =
			__qio_wait_for_kernel(brick, mref_a);

		get_real_lamport(&now);
		mref_a->completion_stamp = now;
	do_complete:
		brick->last_hanging_stamp = now;

		_qio_complete_mref_a(brick, mref_a, err);
	}

	MARS_DBG("QIO thread %d stopping on '%s'\n",
		 anch->anch_prio, brick->brick_path);

	WRITE_ONCE(anch->has_terminated, true);
	return 0;
}
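
/* Backoff illustration for the idle loop above: while wakeup signals
 * keep arriving, sleep_jiffies stays 0 (pure polling); across idle
 * rounds whose jiffies counter actually advanced, it grows
 * 0 -> 1 -> 2 (= QIO_MAX_JIFFIES), bounding the wait passed to
 * brick_wait_smp(); it drops back to 0 as soon as a request is
 * dequeued again.
 */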

/* ////////////////// own brick / input / output operations ////////////////// */

static
atomic_t thread_series_nr = ATOMIC_INIT(0);

static
bool qio_switch_on(struct qio_brick *brick)
{
	const char *path = brick->brick_path;
	struct mapfree_info *mf;
	struct file *file;
	int flags = O_RDWR | O_LARGEFILE;
	int i;
	int series_nr;
	char class = 'A';
	bool fully_operational;

	mf = brick->mf;
	if (mf) {
		file = mf->mf_filp;
		if (file)
			goto make_threads;
	}

	if (brick->o_creat) {
		flags |= (O_NOFOLLOW | O_CREAT);
		MARS_DBG("using O_CREAT on %s\n", path);
	}

	brick->error = 0;
	mf = mapfree_get(path, flags, &brick->error);
	if (unlikely(!mf)) {
		MARS_ERR("QIO could not open file='%s' flags=%d error=%d\n",
			 path, flags, brick->error);
		return false;
	}
	file = mf->mf_filp;
	if (unlikely(!file)) {
		MARS_ERR("QIO file='%s' flags=%d invalid filp\n",
			 path, flags);
		mapfree_put(mf);
		return false;
	}
	brick->mf = mf;
	brick->file = file;

	series_nr = atomic_inc_return(&thread_series_nr);
	if (brick->last_submitted_stamp.tv_sec) {
		brick->last_hanging_stamp.tv_sec = 0;
		brick->last_submitted_stamp.tv_sec = 0;
		brick->last_completion_stamp.tv_sec = 0;
	}

 make_threads:
	fully_operational = true;
	for (i = 0; i < 2; i++) {
		struct qio_anchors *anch = &brick->thread_anch[i];

		/* The thread may have been started already,
		 * and may currently be stopping.
		 */
		if (anch->thread)
			brick_wake_smp(&anch->event);

		/* Check for races against old threads.
		 * Switches may change more frequently than any
		 * slowed-down thread.
		 * NB: we try to compensate for furious operational
		 * problems as seen by sysadmins (e.g. caused
		 * by semi-defective hardware etc).
		 * Thus we use classical kthreads here, no
		 * sophisticated lowlevel technology.
		 */
		if (READ_ONCE(anch->should_terminate) &&
		    !READ_ONCE(anch->has_terminated)) {
			fully_operational = false;
			/* retry after the old thread has actually terminated */
			continue;
		}
		if (anch->thread)
			continue;

		WRITE_ONCE(anch->should_terminate, false);
		WRITE_ONCE(anch->has_terminated, false);
		anch->thread =
			brick_thread_create(qio_thread, anch,
					    "mars_qio%c%d",
					    class + i,
					    series_nr);
		if (unlikely(!anch->thread)) {
			MARS_ERR("cannot create QIO thread %c\n",
				 class + i);
			/* Retry thread creation next time */
			fully_operational = false;
		}
	}
	return fully_operational;
}

static
bool qio_switch_off(struct qio_brick *brick)
{
	struct mapfree_info *mf;
	int nr_flying;
	int nr_running;
	int i;

 retry:
	mb();
	nr_running = 0;
	nr_flying =
		atomic_read(&brick->flying_reads) +
		atomic_read(&brick->flying_writes);
	for (i = 0; i < 2; i++) {
		struct qio_anchors *anch = &brick->thread_anch[i];
		bool is_dirty;

		WRITE_ONCE(anch->should_terminate, true);
		if (!anch->thread)
			continue;
		nr_running++;
		brick_wake_smp(&anch->event);
		if (nr_flying > 0)
			continue;
		qio_lock(anch);
		is_dirty =
			!list_empty(&anch->submitted_list);
		qio_unlock(anch);
		if (is_dirty)
			continue;
		if (!READ_ONCE(anch->has_terminated)) {
			continue;
		}
		brick_thread_stop(anch->thread);
		anch->thread = NULL;
	}
	if (nr_flying > 0) {
		MARS_DBG("QIO %d requests are flying\n",
			 nr_flying);
		/* Not yet fully off */
		brick_yield();
		goto retry;
	}
	if (nr_running) {
		MARS_DBG("QIO threads %d not yet terminated\n",
			 nr_running);
		/* Not yet fully off */
		brick_yield();
		goto retry;
	}
	mf = brick->mf;
	brick->mf = NULL;
	if (mf) {
		MARS_DBG("closing mf=%p filename='%s'\n",
			 mf, mf->mf_name);
		mapfree_put(mf);
	}
	/* Now we should be fully off */
	return true;
}

static
int qio_get_info(struct qio_output *output, struct mars_info *info)
{
	struct qio_brick *brick;
	struct file *file;

	if (unlikely(!output))
		return -EINVAL;

	brick = output->brick;
	if (unlikely(!brick ||
		     !brick->file))
		return -EINVAL;
	file = brick->file;
	if (unlikely(!file))
		return -EINVAL;

	if (unlikely(!file->f_mapping ||
		     !file->f_mapping->host))
		return -EINVAL;

	info->tf_align = 1;
	info->tf_min_size = 1;
	info->current_size = get_total_size(brick);

	MARS_DBG("determined file size = %lld\n", info->current_size);
	return 0;
}

static
int qio_ref_get(struct qio_output *output, struct mref_object *mref)
{
	struct qio_brick *brick = output->brick;
	struct qio_mref_aspect *mref_a;
	int res;

	mref_a = qio_mref_get_aspect(brick, mref);
	res = _qio_ref_get(brick, mref_a);
	return res;
}

static
void qio_ref_put(struct qio_output *output, struct mref_object *mref)
{
	struct qio_brick *brick = output->brick;
	struct qio_mref_aspect *mref_a;

	mref_a = qio_mref_get_aspect(brick, mref);
	_qio_ref_put(brick, mref_a);
}

static
void qio_ref_io(struct qio_output *output, struct mref_object *mref)
{
	struct qio_brick *brick = NULL;
	struct qio_mref_aspect *mref_a = NULL;

	CHECK_PTR(output, fatal);
	brick = output->brick;
	CHECK_PTR(brick, fatal);
	CHECK_PTR(mref, fatal);
	CHECK_PTR(mref->ref_data, fatal);

	_mref_check(mref);

	if (unlikely(!brick->power.led_on)) {
		SIMPLE_CALLBACK(mref, -EBADFD);
		return;
	}

	_mref_get(mref);

	mref_a = qio_mref_get_aspect(brick, mref);
	CHECK_PTR(mref_a, fatal);

	_qio_ref_io(brick, mref_a);
	return;

 fatal:
	MARS_FAT("bad pointer %p %p %p %p (%p), giving up...\n",
		 output, brick, mref, mref_a,
		 mref ? mref->ref_data : NULL);
}

static
int qio_switch(struct qio_brick *brick)
{
	if (brick->power.button) {
		bool success = false;

		if (brick->power.led_on)
			goto done;
		mars_power_led_off((void *)brick, false);
		/* Intermediate states may happen:
		 * !success means: not yet fully on
		 */
		success = qio_switch_on(brick);
		if (success) {
			mars_power_led_on((void *)brick, true);
		}
	} else {
		bool success = false;

		if (brick->power.led_off)
			goto done;
		mars_power_led_on((void *)brick, false);
		/* Intermediate states may happen:
		 * !success means: not yet fully off
		 */
		success = qio_switch_off(brick);
		if (success) {
			mars_power_led_off((void *)brick, true);
			mars_remote_trigger(MARS_TRIGGER_LOCAL | MARS_TRIGGER_FROM_REMOTE);
		}
	}
 done:
	return 0;
}
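
/* Descriptive sketch of the switching protocol above: button on means
 * led_off(false), then qio_switch_on(), then led_on(true) once fully
 * operational; button off runs the mirrored sequence via
 * qio_switch_off(). Intermediate rounds simply return 0, and the
 * surrounding brick infrastructure is assumed to call qio_switch()
 * again until a stable LED state is reached.
 */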

/* //////////////// informational / statistics /////////////// */

static
char *qio_statistics(struct qio_brick *brick, int verbose)
{
	char *res = brick_string_alloc(4096);

	if (!res)
		return NULL;

	snprintf(res, 4095,
		 "last started=%lld.%09ld "
		 "submitted=%lld.%09ld "
#ifdef CONFIG_MARS_DEBUG
		 "dequeue=%lld.%09ld "
#endif
		 "completed=%lld.%09ld "
		 "hanging=%lld.%09ld | "
		 "flying reads=%d "
		 "writes=%d | "
#ifdef CONFIG_MARS_DEBUG
		 "submit=%lld+%d "
		 "complet=%lld+%d "
#endif
		 "error=%d\n",
		 brick->last_started_stamp.tv_sec,
		 brick->last_started_stamp.tv_nsec,
		 brick->last_submitted_stamp.tv_sec,
		 brick->last_submitted_stamp.tv_nsec,
#ifdef CONFIG_MARS_DEBUG
		 brick->last_dequeue_stamp.tv_sec,
		 brick->last_dequeue_stamp.tv_nsec,
#endif
		 brick->last_completion_stamp.tv_sec,
		 brick->last_completion_stamp.tv_nsec,
		 brick->last_hanging_stamp.tv_sec,
		 brick->last_hanging_stamp.tv_nsec,
		 atomic_read(&brick->flying_reads),
		 atomic_read(&brick->flying_writes),
#ifdef CONFIG_MARS_DEBUG
		 brick->submit_pos,
		 brick->submit_len,
		 brick->complet_pos,
		 brick->complet_len,
#endif
		 brick->error);

	return res;
}

static
void qio_reset_statistics(struct qio_brick *brick)
{
}

/* //////////////// object / aspect constructors / destructors /////////////// */

static
int qio_mref_aspect_init_fn(struct generic_aspect *_ini)
{
	struct qio_mref_aspect *ini = (void *)_ini;

	INIT_LIST_HEAD(&ini->io_head);
	init_qio_rw(&ini->qio_rw);
	return 0;
}

static
void qio_mref_aspect_exit_fn(struct generic_aspect *_ini)
{
	struct qio_mref_aspect *ini = (void *)_ini;

	exit_qio_rw(&ini->qio_rw);
	CHECK_HEAD_EMPTY(&ini->io_head);
}

MARS_MAKE_STATICS(qio);

/* ////////////////////// brick constructors / destructors //////////////////// */

static
int qio_brick_construct(struct qio_brick *brick)
{
	int i;

	for (i = 0; i < 2; i++) {
		struct qio_anchors *anch = &brick->thread_anch[i];

		anch->brick = brick;
		anch->anch_prio = i;
		mutex_init(&anch->prio_mutex);
		INIT_LIST_HEAD(&anch->submitted_list);
		init_waitqueue_head(&anch->event);
	}
	return 0;
}

static
int qio_brick_destruct(struct qio_brick *brick)
{
	int i;

	for (i = 0; i < 2; i++) {
		struct qio_anchors *anch = &brick->thread_anch[i];

		CHECK_HEAD_EMPTY(&anch->submitted_list);
	}
	return 0;
}

static
int qio_output_construct(struct qio_output *output)
{
	return 0;
}

static
int qio_output_destruct(struct qio_output *output)
{
	return 0;
}

/* ///////////////////////// static structs //////////////////////// */

static
struct qio_brick_ops qio_brick_ops = {
	.brick_switch = qio_switch,
	.brick_statistics = qio_statistics,
	.reset_statistics = qio_reset_statistics,
};

static
struct qio_output_ops qio_output_ops = {
	.mars_get_info = qio_get_info,
	.mref_get = qio_ref_get,
	.mref_put = qio_ref_put,
	.mref_io = qio_ref_io,
};

const struct qio_input_type qio_input_type = {
	.type_name = "qio_input",
	.input_size = sizeof(struct qio_input),
};

static
const struct qio_input_type *qio_input_types[] = {
	&qio_input_type,
};

const struct qio_output_type qio_output_type = {
	.type_name = "qio_output",
	.output_size = sizeof(struct qio_output),
	.master_ops = &qio_output_ops,
	.output_construct = &qio_output_construct,
	.output_destruct = &qio_output_destruct,
};

static
const struct qio_output_type *qio_output_types[] = {
	&qio_output_type,
};

const struct qio_brick_type qio_brick_type = {
	.type_name = "qio_brick",
	.brick_size = sizeof(struct qio_brick),
	.max_inputs = 0,
	.max_outputs = 1,
	.master_ops = &qio_brick_ops,
	.aspect_types = qio_aspect_types,
	.default_input_types = qio_input_types,
	.default_output_types = qio_output_types,
	.brick_construct = &qio_brick_construct,
	.brick_destruct = &qio_brick_destruct,
};
EXPORT_SYMBOL_GPL(qio_brick_type);

#endif /* ENABLE_MARS_QIO */

/* ////////////////// module init stuff ///////////////////////// */

int __init init_mars_qio(void)
{
#ifdef ENABLE_MARS_QIO
	MARS_INF("init_qio()\n");
	_qio_brick_type = (void *)&qio_brick_type;
	return qio_register_brick_type();
#else
	return 0;
#endif
}

void exit_mars_qio(void)
{
#ifdef ENABLE_MARS_QIO
	MARS_INF("exit_qio()\n");
	qio_unregister_brick_type();
#endif
}