/* * MARS Long Distance Replication Software * * This file is part of MARS project: http://schoebel.github.io/mars/ * * Copyright (C) 2010-2014 Thomas Schoebel-Theuer * Copyright (C) 2011-2014 1&1 Internet AG * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ //#define BRICK_DEBUGGING //#define MARS_DEBUGGING //#define IO_DEBUGGING #define USE_VFS_READ #define USE_VFS_WRITE #include #include #include #include #include #include #include #include #include #include #include #include "brick_wait.h" #include "mars.h" ///////////////////////// own type definitions //////////////////////// #include "mars_sio.h" ////////////////// own brick / input / output operations ////////////////// static int sio_ref_get(struct sio_output *output, struct mref_object *mref) { struct file *file; if (unlikely(!output->brick->power.led_on)) return -EBADFD; if (mref->ref_initialized) { _mref_get(mref); return mref->ref_len; } file = output->mf->mf_filp; if (file) { loff_t total_size = i_size_read(file->f_mapping->host); mref->ref_total_size = total_size; /* Only check reads. * Writes behind EOF are always allowed (sparse files) */ if (!(mref->ref_flags & MREF_MAY_WRITE)) { loff_t len = total_size - mref->ref_pos; if (unlikely(len <= 0)) { /* Special case: allow reads starting _exactly_ at EOF when a timeout is specified. */ if (len < 0 || mref->ref_timeout <= 0) { MARS_DBG("ENODATA %lld\n", len); return -ENODATA; } } // Shorten below EOF, but allow special case if (mref->ref_len > len && len > 0) { mref->ref_len = len; } } } /* Buffered IO. */ if (!mref->ref_data) { struct sio_mref_aspect *mref_a = sio_mref_get_aspect(output->brick, mref); if (unlikely(!mref_a)) return -EILSEQ; if (unlikely(mref->ref_len <= 0)) { MARS_ERR("bad ref_len = %d\n", mref->ref_len); return -EBADR; } mref->ref_data = brick_block_alloc(mref->ref_pos, (mref_a->alloc_len = mref->ref_len)); mref_a->do_dealloc = true; //atomic_inc(&output->total_alloc_count); //atomic_inc(&output->alloc_count); } _mref_get_first(mref); return mref->ref_len; } static void sio_ref_put(struct sio_output *output, struct mref_object *mref) { struct file *file; struct sio_mref_aspect *mref_a; if (!_mref_put(mref)) return; file = output->mf->mf_filp; mref->ref_total_size = i_size_read(file->f_mapping->host); mref_a = sio_mref_get_aspect(output->brick, mref); if (mref_a && mref_a->do_dealloc) { brick_block_free(mref->ref_data, mref_a->alloc_len); //atomic_dec(&output->alloc_count); } sio_free_mref(mref); } #ifndef MARS_HAS_KERNEL_READ // some code borrowed from the loopback driver static int transfer_none(int cmd, struct page *raw_page, unsigned raw_off, void *loop_buf, int size) { void *raw_buf; if (unlikely(!raw_page || !loop_buf)) { MARS_ERR("transfer NULL: %p %p\n", raw_buf, loop_buf); return -EFAULT; } raw_buf = page_address(raw_page) + raw_off; if (cmd == READ) memcpy(loop_buf, raw_buf, size); else memcpy(raw_buf, loop_buf, size); brick_yield(); return 0; } #endif static int write_aops(struct sio_output *output, struct mref_object *mref) { struct file *file = output->mf->mf_filp; loff_t pos = mref->ref_pos; void *data = mref->ref_data; int len = mref->ref_len; int ret = 0; #ifdef MARS_HAS_KERNEL_READ ret = kernel_write(file, data, len, &pos); #else #ifdef USE_VFS_WRITE mm_segment_t oldfs; oldfs = get_fs(); set_fs(KERNEL_DS); ret = vfs_write(file, data, len, &pos); set_fs(oldfs); #else unsigned offset; struct address_space *mapping; if (unlikely(!file)) { MARS_FAT("No FILE\n"); return -ENXIO; } mapping = file->f_mapping; mutex_lock(&mapping->host->i_mutex); offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); while (len > 0) { int transfer_result; unsigned size, copied; struct page *page = NULL; void *fsdata; size = PAGE_CACHE_SIZE - offset; if (size > len) size = len; ret = pagecache_write_begin(file, mapping, pos, size, 0, &page, &fsdata); if (ret) { MARS_ERR("cannot start pagecache_write_begin() error=%d\n", ret); if (ret >= 0) ret = -EINVAL; goto fail; } //file_update_time(file); transfer_result = transfer_none(WRITE, page, offset, data, size); copied = size; if (transfer_result) { MARS_ERR("transfer error %d\n", transfer_result); copied = 0; } ret = pagecache_write_end(file, mapping, pos, size, copied, page, fsdata); if (ret < 0 || ret != copied || transfer_result) { MARS_ERR("write error %d\n", ret); if (ret >= 0) ret = -EINVAL; goto fail; } len -= copied; offset = 0; pos += copied; data += copied; } ret = 0; fail: mutex_unlock(&mapping->host->i_mutex); #if 1 blk_run_address_space(mapping); #endif #endif #endif return ret; } struct cookie_data { struct sio_output *output; struct mref_object *mref; }; #ifndef MARS_HAS_KERNEL_READ static int sio_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct cookie_data *cookie = sd->u.data; struct mref_object *mref = cookie->mref; struct page *page = buf->page; void *data; int size, ret; ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; size = sd->len; if (size > mref->ref_len) size = mref->ref_len; data = mref->ref_data; if (transfer_none(READ, page, buf->offset, data, size)) { MARS_ERR("transfer error\n"); size = -EINVAL; } //flush_dcache_page(p->bvec->bv_page); return size; } static int sio_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { return __splice_from_pipe(pipe, sd, sio_splice_actor); } #endif static int read_aops(struct sio_output *output, struct mref_object *mref) { loff_t pos = mref->ref_pos; int len = mref->ref_len; int ret; #ifdef MARS_HAS_KERNEL_READ ret = kernel_read(output->mf->mf_filp, mref->ref_data, len, &pos); #else #ifdef USE_VFS_READ mm_segment_t oldfs; (void) sio_direct_splice_actor; // shut up gcc oldfs = get_fs(); set_fs(KERNEL_DS); ret = vfs_read(output->mf->mf_filp, mref->ref_data, len, &pos); set_fs(oldfs); #else struct cookie_data cookie = { .output = output, .mref = mref, }; struct splice_desc sd = { .len = 0, .total_len = len, .flags = 0, .pos = pos, .u.data = &cookie, }; ret = splice_direct_to_actor(output->mf->mf_filp, &sd, sio_direct_splice_actor); #endif #endif if (unlikely(ret < 0)) { MARS_ERR("%p %p status=%d\n", output, mref, ret); } return ret; } static void sync_file(struct sio_output *output) { struct file *file = output->mf->mf_filp; int ret; #if defined(S_BIAS) || (defined(RHEL_MAJOR) && (RHEL_MAJOR < 7)) ret = vfs_fsync(file, file->f_path.dentry, 1); #else ret = vfs_fsync(file, 1); #endif if (unlikely(ret)) { MARS_ERR("syncing pages failed: %d\n", ret); } return; } static void _complete(struct sio_output *output, struct mref_object *mref, int err) { _mref_check(mref); mars_trace(mref, "sio_endio"); if (err < 0) { MARS_ERR("IO error %d at pos=%lld len=%d (mref=%p ref_data=%p)\n", err, mref->ref_pos, mref->ref_len, mref, mref->ref_data); } else { mref_checksum(mref); mref->ref_flags |= MREF_UPTODATE; } #ifdef CONFIG_MARS_DEBUG while (mars_hang_mode & 1) brick_msleep(100); #endif CHECKED_CALLBACK(mref, err, err_found); done: #if 0 if (mref->ref_flags & MREF_WRITE) { atomic_dec(&output->write_count); } else { atomic_dec(&output->read_count); } #endif sio_ref_put(output, mref); atomic_dec(&output->work_count); atomic_dec(&mars_global_io_flying); return; err_found: MARS_FAT("giving up...\n"); goto done; } /* This is called by the threads */ static void _sio_ref_io(struct sio_threadinfo *tinfo, struct mref_object *mref) { struct sio_output *output = tinfo->output; bool barrier = false; int status; _mref_check(mref); atomic_inc(&tinfo->fly_count); if (unlikely(!output->mf || !output->mf->mf_filp)) { status = -EINVAL; goto done; } if (barrier) { MARS_INF("got barrier request\n"); sync_file(output); } if (!(mref->ref_flags & MREF_WRITE)) { status = read_aops(output, mref); } else { mf_dirty_append(output->mf, DIRTY_SUBMITTED, mref->ref_pos + mref->ref_len); status = write_aops(output, mref); if (status >= 0) { if (barrier || output->brick->o_fdsync) sync_file(output); mf_dirty_append(output->mf, DIRTY_COMPLETING, mref->ref_pos + mref->ref_len); } } mapfree_set(output->mf, mref->ref_pos, mref->ref_pos + mref->ref_len); done: if ((mref->ref_flags & MREF_WRITE) && status >= 0) mf_dirty_append(output->mf, DIRTY_FINISHED, mref->ref_pos + mref->ref_len); _complete(output, mref, status); atomic_dec(&tinfo->fly_count); } /* This is called from outside */ static void sio_ref_io(struct sio_output *output, struct mref_object *mref) { int index; struct sio_threadinfo *tinfo; struct sio_mref_aspect *mref_a; unsigned long flags; _mref_check(mref); mref_a = sio_mref_get_aspect(output->brick, mref); if (unlikely(!mref_a)) { MARS_FAT("cannot get aspect\n"); SIMPLE_CALLBACK(mref, -EINVAL); return; } if (unlikely(!output->brick->power.led_on)) { SIMPLE_CALLBACK(mref, -EBADFD); return; } atomic_inc(&mars_global_io_flying); atomic_inc(&output->work_count); _mref_get(mref); mapfree_set(output->mf, mref->ref_pos, -1); index = 0; if (!(mref->ref_flags & MREF_WRITE)) { traced_lock(&output->g_lock, flags); index = output->index++; traced_unlock(&output->g_lock, flags); index = (index % WITH_THREAD) + 1; } tinfo = &output->tinfo[index]; MARS_IO("queueing %p on %d\n", mref, index); atomic_inc(&tinfo->total_count); atomic_inc(&tinfo->queue_count); traced_lock(&tinfo->lock, flags); list_add_tail(&mref_a->io_head, &tinfo->mref_list); traced_unlock(&tinfo->lock, flags); wake_up_interruptible(&tinfo->event); } static int sio_thread(void *data) { struct sio_threadinfo *tinfo = data; MARS_INF("sio thread has started.\n"); //set_user_nice(current, -20); while (!brick_thread_should_stop()) { struct list_head *tmp = NULL; struct mref_object *mref; struct sio_mref_aspect *mref_a; unsigned long flags; wait_event_interruptible_timeout( tinfo->event, !list_empty(&tinfo->mref_list) || brick_thread_should_stop(), HZ); tinfo->last_jiffies = jiffies; traced_lock(&tinfo->lock, flags); if (!list_empty(&tinfo->mref_list)) { tmp = tinfo->mref_list.next; list_del_init(tmp); atomic_dec(&tinfo->queue_count); } traced_unlock(&tinfo->lock, flags); if (!tmp) continue; mref_a = container_of(tmp, struct sio_mref_aspect, io_head); mref = mref_a->object; MARS_IO("got %p %p\n", mref_a, mref); _sio_ref_io(tinfo, mref); } MARS_INF("sio thread has stopped.\n"); return 0; } static int sio_get_info(struct sio_output *output, struct mars_info *info) { struct file *file = output->mf->mf_filp; if (unlikely(!file || !file->f_mapping || !file->f_mapping->host)) return -EINVAL; info->tf_align = 1; info->tf_min_size = 1; info->current_size = i_size_read(file->f_mapping->host); MARS_DBG("determined file size = %lld\n", info->current_size); return 0; } //////////////// informational / statistics /////////////// static noinline char *sio_statistics(struct sio_brick *brick, int verbose) { struct sio_output *output = brick->outputs[0]; char *res = brick_string_alloc(1024); int queue_sum = 0; int fly_sum = 0; int total_sum = 0; int i; if (!res) return NULL; for (i = 1; i <= WITH_THREAD; i++) { struct sio_threadinfo *tinfo = &output->tinfo[i]; queue_sum += atomic_read(&tinfo->queue_count); fly_sum += atomic_read(&tinfo->fly_count); total_sum += atomic_read(&tinfo->total_count); } snprintf(res, 1024, "queued read = %d write = %d " "flying read = %d write = %d " "total read = %d write = %d " "\n", queue_sum, atomic_read(&output->tinfo[0].queue_count), fly_sum, atomic_read(&output->tinfo[0].fly_count), total_sum, atomic_read(&output->tinfo[0].total_count) ); return res; } static noinline void sio_reset_statistics(struct sio_brick *brick) { struct sio_output *output = brick->outputs[0]; int i; for (i = 0; i <= WITH_THREAD; i++) { struct sio_threadinfo *tinfo = &output->tinfo[i]; atomic_set(&tinfo->total_count, 0); } } //////////////// object / aspect constructors / destructors /////////////// static int sio_mref_aspect_init_fn(struct generic_aspect *_ini) { struct sio_mref_aspect *ini = (void*)_ini; INIT_LIST_HEAD(&ini->io_head); return 0; } static void sio_mref_aspect_exit_fn(struct generic_aspect *_ini) { struct sio_mref_aspect *ini = (void*)_ini; (void)ini; #if 1 CHECK_HEAD_EMPTY(&ini->io_head); #endif } MARS_MAKE_STATICS(sio); ////////////////////// brick constructors / destructors //////////////////// static int sio_brick_construct(struct sio_brick *brick) { return 0; } static int sio_switch(struct sio_brick *brick) { static int sio_nr = 0; struct sio_output *output = brick->outputs[0]; const char *path = output->brick->brick_path; int status = 0; if (brick->power.button) { int flags = O_CREAT | O_NOFOLLOW | O_RDWR | O_LARGEFILE; int index; if (brick->power.led_on) goto done; if (brick->o_direct) { flags |= O_DIRECT; MARS_INF("using O_DIRECT on %s\n", path); } mars_power_led_off((void*)brick, false); output->error = 0; output->mf = mapfree_get(path, flags, &output->error); if (unlikely(!output->mf || IS_ERR(output->mf))) { status = output->error; if (!status) status = -ENOENT; output->mf = NULL; MARS_ERR("open '%s' flags=%d status=%d\n", path, flags, status); goto done; } output->index = 0; for (index = 0; index <= WITH_THREAD; index++) { struct sio_threadinfo *tinfo = &output->tinfo[index]; tinfo->last_jiffies = jiffies; tinfo->thread = brick_thread_create(sio_thread, tinfo, "mars_sio%d", sio_nr++); if (unlikely(!tinfo->thread)) { MARS_ERR("cannot create thread\n"); status = -ENOENT; goto done; } } mars_power_led_on((void*)brick, true); } done: if (unlikely(status < 0) || !brick->power.button) { int index; int count; mars_power_led_on((void*)brick, false); for (;;) { count = atomic_read(&output->work_count); if (count <= 0) break; MARS_DBG("working on %d requests\n", count); brick_msleep(1000); } for (index = 0; index <= WITH_THREAD; index++) { struct sio_threadinfo *tinfo = &output->tinfo[index]; if (!tinfo->thread) continue; MARS_DBG("stopping thread %d\n", index); brick_thread_stop(tinfo->thread); tinfo->thread = NULL; } if (output->mf) { MARS_DBG("closing file\n"); mapfree_put(output->mf); output->mf = NULL; } mars_power_led_off((void*)brick, true); } return status; } static int sio_output_construct(struct sio_output *output) { int index; spin_lock_init(&output->g_lock); for (index = 0; index <= WITH_THREAD; index++) { struct sio_threadinfo *tinfo = &output->tinfo[index]; tinfo->output = output; spin_lock_init(&tinfo->lock); init_waitqueue_head(&tinfo->event); INIT_LIST_HEAD(&tinfo->mref_list); } return 0; } static int sio_output_destruct(struct sio_output *output) { return 0; } ///////////////////////// static structs //////////////////////// static struct sio_brick_ops sio_brick_ops = { .brick_switch = sio_switch, .brick_statistics = sio_statistics, .reset_statistics = sio_reset_statistics, }; static struct sio_output_ops sio_output_ops = { .mref_get = sio_ref_get, .mref_put = sio_ref_put, .mref_io = sio_ref_io, .mars_get_info = sio_get_info, }; const struct sio_input_type sio_input_type = { .type_name = "sio_input", .input_size = sizeof(struct sio_input), }; static const struct sio_input_type *sio_input_types[] = { &sio_input_type, }; const struct sio_output_type sio_output_type = { .type_name = "sio_output", .output_size = sizeof(struct sio_output), .master_ops = &sio_output_ops, .output_construct = &sio_output_construct, .output_destruct = &sio_output_destruct, }; static const struct sio_output_type *sio_output_types[] = { &sio_output_type, }; const struct sio_brick_type sio_brick_type = { .type_name = "sio_brick", .brick_size = sizeof(struct sio_brick), .max_inputs = 0, .max_outputs = 1, .master_ops = &sio_brick_ops, .aspect_types = sio_aspect_types, .default_input_types = sio_input_types, .default_output_types = sio_output_types, .brick_construct = &sio_brick_construct, }; EXPORT_SYMBOL_GPL(sio_brick_type); ////////////////// module init stuff ///////////////////////// int __init init_mars_sio(void) { MARS_INF("init_sio()\n"); _sio_brick_type = (void*)&sio_brick_type; return sio_register_brick_type(); } void exit_mars_sio(void) { MARS_INF("exit_sio()\n"); sio_unregister_brick_type(); }