/* * MARS Long Distance Replication Software * * This file is part of MARS project: http://schoebel.github.io/mars/ * * Copyright (C) 2010-2014 Thomas Schoebel-Theuer * Copyright (C) 2011-2014 1&1 Internet AG * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ // Bio brick (interface to blkdev IO via kernel bios) //#define BRICK_DEBUGGING //#define MARS_DEBUGGING //#define IO_DEBUGGING //#define FAKE_IO #include #include #include #include #include #include "brick_wait.h" #include "mars.h" #include "lib_timing.h" #include "lib_mapfree.h" #include "lib_limiter.h" #include "mars_bio.h" int bio_nr_requests = 1024; static struct timing_stats timings[2] = {}; struct threshold bio_submit_threshold = { .thr_ban = &mars_global_ban, .thr_parent = &global_io_threshold, .thr_limit = BIO_SUBMIT_MAX_LATENCY, .thr_factor = 100, .thr_plus = 0, }; EXPORT_SYMBOL_GPL(bio_submit_threshold); struct threshold bio_io_threshold[2] = { [0] = { .thr_ban = &mars_global_ban, .thr_parent = &global_io_threshold, .thr_limit = BIO_IO_R_MAX_LATENCY, .thr_factor = 10, .thr_plus = 10000, }, [1] = { .thr_ban = &mars_global_ban, .thr_parent = &global_io_threshold, .thr_limit = BIO_IO_W_MAX_LATENCY, .thr_factor = 10, .thr_plus = 10000, }, }; EXPORT_SYMBOL_GPL(bio_io_threshold); #ifdef CONFIG_MARS_DEBUG struct mars_limiter bio_throttle_read = { }; struct mars_limiter bio_throttle_write = { }; #endif ///////////////////////// own type definitions //////////////////////// ///////////////////////// own helper functions //////////////////////// /* This is called from the kernel bio layer. */ // remove_this #if defined(MARS_HAS_BI_STATUS) || defined(MARS_HAS_BI_ERROR) // end_remove_this static void bio_callback(struct bio *bio) // remove_this #else static void bio_callback(struct bio *bio, int code) #endif // end_remove_this { struct bio_mref_aspect *mref_a = bio->bi_private; struct bio_brick *brick; unsigned int rsp_nr; unsigned long flags; CHECK_PTR(mref_a, err); CHECK_PTR(mref_a->output, err); brick = mref_a->output->brick; CHECK_PTR(brick, err); // remove_this #ifdef MARS_HAS_BI_STATUS // end_remove_this mref_a->status_code = blk_status_to_errno(bio->bi_status); // remove_this #else #ifdef MARS_HAS_BI_ERROR mref_a->status_code = bio->bi_error; #else mref_a->status_code = code; #endif #endif // end_remove_this spin_lock_irqsave(&brick->lock, flags); list_del(&mref_a->io_head); rsp_nr = (brick->rsp_nr + 1) % BIO_RESPONSE_THREADS; brick->rsp_nr = rsp_nr; list_add_tail(&mref_a->io_head, &brick->rsp[rsp_nr].completed_list); atomic_inc(&brick->completed_count); spin_unlock_irqrestore(&brick->lock, flags); brick_wake_smp(&brick->rsp[rsp_nr].response_event); return; err: MARS_FAT("cannot handle bio callback\n"); } /* Map from kernel address/length to struct page (if not already known), * check alignment constraints, create bio from it. * Return the length (may be smaller than requested). */ static int make_bio(struct bio_brick *brick, void *data, int len, loff_t pos, struct bio_mref_aspect *private, struct bio **_bio) { unsigned long long sector; int sector_offset; int data_offset; int page_offset; int page_len; int bvec_count; int rest_len = len; int result_len = 0; int ms = 0; int status; int i; struct bio *bio = NULL; struct block_device *bdev; status = -EINVAL; CHECK_PTR(brick, out); bdev = brick->bdev; CHECK_PTR(bdev, out); if (unlikely(rest_len <= 0)) { MARS_ERR("bad bio len %d\n", rest_len); goto out; } sector = pos >> 9; // TODO: make dynamic sector_offset = pos & ((1 << 9) - 1); // TODO: make dynamic data_offset = ((unsigned long)data) & ((1 << 9) - 1); // TODO: make dynamic if (unlikely(sector_offset > 0)) { MARS_ERR("odd sector offset %d\n", sector_offset); goto out; } if (unlikely(sector_offset != 0)) { MARS_ERR("bad alignment: sector_offset %d != 0\n", sector_offset); goto out; } if (unlikely(rest_len & ((1 << 9) - 1))) { MARS_ERR("odd length %d\n", rest_len); goto out; } page_offset = ((unsigned long)data) & (PAGE_SIZE-1); page_len = rest_len + page_offset; bvec_count = (page_len - 1) / PAGE_SIZE + 1; if (bvec_count > brick->bvec_max) { bvec_count = brick->bvec_max; } else if (unlikely(bvec_count <= 0)) { MARS_WRN("bvec_count=%d\n", bvec_count); bvec_count = 1; } MARS_IO("sector_offset = %d data = %p pos = %lld rest_len = %d page_offset = %d page_len = %d bvec_count = %d\n", sector_offset, data, pos, rest_len, page_offset, page_len, bvec_count); retry_bio_alloc: bio = bio_alloc(GFP_MARS, bvec_count); if (unlikely(!bio)) { msleep_backoff(&ms); goto retry_bio_alloc; } for (i = 0; i < bvec_count && rest_len > 0; i++) { struct page *page; int this_rest = PAGE_SIZE - page_offset; int this_len = rest_len; if (this_len > this_rest) { this_len = this_rest; } #ifdef MARS_DEBUGGING if (unlikely(!virt_addr_valid(data))) { MARS_ERR("invalid virtual kernel address %p\n", data); status = -EINVAL; goto out; } #endif page = brick_iomap(data, &page_offset, &this_len); if (unlikely(!page)) { MARS_ERR("cannot iomap() kernel address %p\n", data); status = -EINVAL; goto out; } MARS_IO(" i = %d page = %p bv_len = %d bv_offset = %d\n", i, page, this_len, page_offset); bio->bi_io_vec[i].bv_page = page; bio->bi_io_vec[i].bv_len = this_len; bio->bi_io_vec[i].bv_offset = page_offset; data += this_len; rest_len -= this_len; result_len += this_len; page_offset = 0; //MARS_IO("page_offset=%d this_len=%d (new len=%d, new status=%d)\n", page_offset, this_len, rest_len, status); } if (unlikely(rest_len != 0)) { MARS_ERR("computation of bvec_count %d was wrong, diff=%d\n", bvec_count, rest_len); status = -EINVAL; goto out; } bio->bi_vcnt = i; // remove_this #ifdef MARS_HAS_BVEC_ITER // end_remove_this bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_size = result_len; bio->bi_iter.bi_sector = sector; // remove_this #else bio->bi_idx = 0; bio->bi_size = result_len; bio->bi_sector = sector; #endif #ifdef MARS_HAS_SET_DEV // end_remove_this bio_set_dev(bio, bdev); // remove_this #else bio->bi_bdev = bdev; #endif // end_remove_this bio->bi_private = private; bio->bi_end_io = bio_callback; #ifndef MARS_HAS_NEW_BIO_OP bio->bi_rw = 0; // must be filled in later #endif status = result_len; out: if (unlikely(status < 0)) { MARS_ERR("error %d\n", status); if (bio) { bio_put(bio); bio = NULL; } } *_bio = bio; return status; } ////////////////// own brick / input / output operations ////////////////// #define PRIO_INDEX(mref) ((mref)->ref_prio + 1) static int bio_get_info(struct bio_output *output, struct mars_info *info) { struct bio_brick *brick = output->brick; struct inode *inode; struct request_queue *q; int status = 0; if (unlikely(!brick->mf || !brick->mf->mf_filp || !brick->mf->mf_filp->f_mapping || !(inode = brick->mf->mf_filp->f_mapping->host))) { status = -ENOENT; goto done; } info->tf_align = 512; info->tf_min_size = 512; q = bdev_get_queue(inode->i_bdev); if (q) { info->tf_align = queue_physical_block_size(q); info->tf_min_size = queue_logical_block_size(q); } brick->total_size = i_size_read(inode); info->current_size = brick->total_size; MARS_DBG("determined device size = %lld\n", info->current_size); done: return status; } static int bio_ref_get(struct bio_output *output, struct mref_object *mref) { struct bio_mref_aspect *mref_a; int status = -EINVAL; CHECK_PTR(output, done); CHECK_PTR(output->brick, done); if (mref->ref_initialized) { _mref_get(mref); return mref->ref_len; } mref_a = bio_mref_get_aspect(output->brick, mref); CHECK_PTR(mref_a, done); mref_a->output = output; mref_a->bio = NULL; if (!mref->ref_data) { // buffered IO. if (unlikely(mref->ref_len <= 0)) { goto done; } mref_a->alloc_len = mref->ref_len; mref->ref_data = brick_block_alloc(mref->ref_pos, mref->ref_len); mref_a->do_dealloc = true; } #ifdef CONFIG_MARS_DEBUG /* Only for testing, e.g. simulation of degraded RAID etc */ if (mref->ref_flags & MREF_WRITE) mars_limit_sleep(&bio_throttle_write, (mref->ref_len + 512) / 1024); else mars_limit_sleep(&bio_throttle_read, (mref->ref_len + 512) / 1024); #endif status = make_bio(output->brick, mref->ref_data, mref->ref_len, mref->ref_pos, mref_a, &mref_a->bio); if (unlikely(status < 0 || !mref_a->bio)) { MARS_ERR("could not create bio, status = %d\n", status); goto done; } if (unlikely(mref->ref_prio < MARS_PRIO_HIGH)) mref->ref_prio = MARS_PRIO_HIGH; else if (unlikely(mref->ref_prio > MARS_PRIO_LOW)) mref->ref_prio = MARS_PRIO_LOW; MARS_IO("len = %d status = %d prio = %d fly = %d\n", mref->ref_len, status, mref->ref_prio, atomic_read(&output->brick->fly_count[PRIO_INDEX(mref)])); mref->ref_len = status; _mref_get_first(mref); status = 0; done: return status; } static void _bio_ref_put(struct bio_output *output, struct mref_object *mref) { struct bio_mref_aspect *mref_a; MARS_IO("deallocating\n"); mref->ref_total_size = output->brick->total_size; mref_a = bio_mref_get_aspect(output->brick, mref); CHECK_PTR(mref_a, err); if (likely(mref_a->bio)) { #ifdef MARS_DEBUGGING int bi_cnt = atomic_read(&mref_a->bio->bi_cnt); if (bi_cnt > 1) { MARS_DBG("bi_cnt = %d\n", bi_cnt); } #endif bio_put(mref_a->bio); mref_a->bio = NULL; } if (mref_a->do_dealloc) { MARS_IO("free page\n"); brick_block_free(mref->ref_data, mref_a->alloc_len); } bio_free_mref(mref); return; err: MARS_FAT("cannot work\n"); } #define BIO_REF_PUT(output,mref) \ ({ \ if (_mref_put(mref)) { \ _bio_ref_put(output, mref); \ } \ }) static void bio_ref_put(struct bio_output *output, struct mref_object *mref) { BIO_REF_PUT(output, mref); } static void _bio_ref_io(struct bio_output *output, struct mref_object *mref, bool cork) { struct bio_brick *brick = output->brick; struct bio_mref_aspect *mref_a = bio_mref_get_aspect(output->brick, mref); struct bio *bio; unsigned long long latency; unsigned long flags; int rw; int status = -EINVAL; CHECK_PTR(mref_a, err); bio = mref_a->bio; CHECK_PTR(bio, err); _mref_get(mref); atomic_inc(&brick->fly_count[PRIO_INDEX(mref)]); bio_get(bio); rw = (mref->ref_flags & MREF_WRITE) ? WRITE : READ; if (cork) { // adapt to different kernel versions (TBD: improve) #ifdef REQ_IDLE rw |= REQ_IDLE; #else /* sorry this went clumsy over time, adaptation to _any_ kernel is a hell */ } else { #if defined(BIO_RW_RQ_MASK) || defined(BIO_FLUSH) rw |= (1 << BIO_RW_NOIDLE); #elif defined(REQ_NOIDLE) rw |= REQ_NOIDLE; #else #warning Cannot control the NOIDLE flag #endif #endif } if (!(mref->ref_flags & MREF_SKIP_SYNC)) { if (brick->do_sync) { #if defined(BIO_RW_RQ_MASK) || defined(BIO_FLUSH) rw |= (1 << BIO_RW_SYNCIO); #elif defined(REQ_SYNC) rw |= REQ_SYNC; #else #warning Cannot control the SYNC flag #endif } #if defined(BIO_RW_RQ_MASK) || defined(BIO_FLUSH) if (brick->do_unplug && !cork) { rw |= (1 << BIO_RW_UNPLUG); } #else // there is no substitute, but the above NOIDLE should do the job (CHECK!) #endif } MARS_IO("starting IO rw = %d prio 0 %d fly = %d\n", rw, mref->ref_prio, atomic_read(&brick->fly_count[PRIO_INDEX(mref)])); mars_trace(mref, "bio_submit"); mref_a->start_stamp = cpu_clock(raw_smp_processor_id()); spin_lock_irqsave(&brick->lock, flags); list_add_tail(&mref_a->io_head, &brick->submitted_list[rw & 1]); spin_unlock_irqrestore(&brick->lock, flags); #ifdef FAKE_IO bio->bi_end_io(bio, 0); #else #ifdef MARS_HAS_NEW_BIO_OP #ifdef REQ_SYNC /* Unsure: REQ_SYNC had been defined since around 2010, * but has not been used accidentally. * Thus it is now a _potential_ bug fix. * Theoretically, I could wipe out the old code. * However, I know of certain Frankenstein kernels * which don't conform to anything. And I am not their * maintainer. Let the old code in place for now. */ if (rw & 1) { bio_set_op_attrs(bio, REQ_OP_WRITE, mref->ref_skip_sync ? 0 : REQ_SYNC); } else { bio_set_op_attrs(bio, REQ_OP_READ, mref->ref_skip_sync ? 0 : REQ_META); } #else if (rw & 1) { bio_set_op_attrs(bio, REQ_OP_WRITE, mref->ref_flags & MREF_SKIP_SYNC ? 0 : WRITE_SYNC); } else { bio_set_op_attrs(bio, REQ_OP_READ, mref->ref_flags & MREF_SKIP_SYNC ? 0 : READ_SYNC); } #endif latency = TIME_STATS( &timings[rw & 1], submit_bio(bio) ); #else bio->bi_rw = rw; latency = TIME_STATS( &timings[rw & 1], submit_bio(rw, bio) ); #endif #endif cond_resched(); threshold_check(&bio_submit_threshold, latency); status = 0; #ifdef BIO_EOPNOTSUPP /* missing since b25de9d6da49b1a8760a89672283128aa8c78345 */ if (unlikely(bio_flagged(bio, BIO_EOPNOTSUPP))) status = -EOPNOTSUPP; #endif MARS_IO("submitted\n"); if (likely(status >= 0)) goto done; bio_put(bio); atomic_dec(&brick->fly_count[PRIO_INDEX(mref)]); err: MARS_ERR("IO error %d\n", status); CHECKED_CALLBACK(mref, status, done); atomic_dec(&mars_global_io_flying); done: ; } static void bio_ref_io(struct bio_output *output, struct mref_object *mref) { CHECK_PTR(mref, fatal); _mref_get(mref); atomic_inc(&mars_global_io_flying); if (mref->ref_prio == MARS_PRIO_LOW || (mref->ref_prio == MARS_PRIO_NORMAL && (mref->ref_flags & MREF_WRITE))) { struct bio_mref_aspect *mref_a = bio_mref_get_aspect(output->brick, mref); struct bio_brick *brick = output->brick; unsigned long flags; spin_lock_irqsave(&brick->lock, flags); list_add_tail(&mref_a->io_head, &brick->queue_list[PRIO_INDEX(mref)]); atomic_inc(&brick->queue_count[PRIO_INDEX(mref)]); spin_unlock_irqrestore(&brick->lock, flags); brick->submitted = true; brick_wake_smp(&brick->submit_event); return; } // realtime IO: start immediately _bio_ref_io(output, mref, false); BIO_REF_PUT(output, mref); return; fatal: MARS_FAT("cannot handle mref %p on output %p\n", mref, output); } static int bio_response_thread(void *data) { struct bio_response *rsp = data; struct bio_brick *brick = rsp->brick; #ifdef IO_DEBUGGING int round = 0; #endif MARS_INF("bio response thread has started on '%s'.\n", brick->brick_path); for (;;) { LIST_HEAD(tmp_list); unsigned long flags; int thr_limit; int sleeptime; int count; int i; thr_limit = bio_io_threshold[0].thr_limit; if (bio_io_threshold[1].thr_limit < thr_limit) thr_limit = bio_io_threshold[1].thr_limit; sleeptime = HZ / 10; if (thr_limit > 0) { sleeptime = thr_limit / (1000000 * 2 / HZ); if (unlikely(sleeptime < 2)) sleeptime = 2; } #ifdef IO_DEBUGGING round++; MARS_IO("%d sleeping %d...\n", round, sleeptime); #endif brick_wait_smp( rsp->response_event, atomic_read(&brick->completed_count) > 0 || (brick_thread_should_stop() && atomic_read(&brick->fly_count[0]) + atomic_read(&brick->fly_count[1]) + atomic_read(&brick->fly_count[2]) <= 0), sleeptime); MARS_IO("%d woken up, completed_count = %d fly_count[0] = %d fly_count[1] = %d fly_count[2] = %d\n", round, atomic_read(&brick->completed_count), atomic_read(&brick->fly_count[0]), atomic_read(&brick->fly_count[1]), atomic_read(&brick->fly_count[2])); #ifdef CONFIG_MARS_DEBUG if (mars_hang_mode & 2) { brick_msleep(100); continue; } #endif spin_lock_irqsave(&brick->lock, flags); list_replace_init(&rsp->completed_list, &tmp_list); spin_unlock_irqrestore(&brick->lock, flags); count = 0; for (;;) { struct list_head *tmp; struct bio_mref_aspect *mref_a; struct mref_object *mref; unsigned long long latency; int rw; int code; if (list_empty(&tmp_list)) { if (brick_thread_should_stop() && atomic_read(&brick->fly_count[0]) + atomic_read(&brick->fly_count[1]) + atomic_read(&brick->fly_count[2]) <= 0) goto done; break; } tmp = tmp_list.next; list_del_init(tmp); atomic_dec(&brick->completed_count); mref_a = container_of(tmp, struct bio_mref_aspect, io_head); mref = mref_a->object; rw = mref->ref_flags & MREF_WRITE ? 1 : 0; latency = cpu_clock(raw_smp_processor_id()) - mref_a->start_stamp; threshold_check(&bio_io_threshold[rw], latency); code = mref_a->status_code; #ifdef IO_DEBUGGING round++; MARS_IO("%d completed , status = %d\n", round, code); #endif mars_trace(mref, "bio_endio"); if (code < 0) { MARS_ERR("IO error %d\n", code); } else { mref_checksum(mref); mref->ref_flags |= MREF_UPTODATE; } SIMPLE_CALLBACK(mref, code); MARS_IO("%d callback done.\n", round); atomic_dec(&brick->fly_count[PRIO_INDEX(mref)]); #ifdef MARS_BIO_DEBUG atomic_inc(&brick->total_completed_count[PRIO_INDEX(mref)]); #endif count++; MARS_IO("%d completed_count = %d fly_count = %d\n", round, atomic_read(&brick->completed_count), atomic_read(&brick->fly_count[PRIO_INDEX(mref)])); if (likely(mref_a->bio)) { bio_put(mref_a->bio); } BIO_REF_PUT(mref_a->output, mref); atomic_dec(&mars_global_io_flying); } /* Try to detect slow requests as early as possible, * even before they have completed. */ for (i = 0; i < 2; i++) { unsigned long long eldest = 0; spin_lock_irqsave(&brick->lock, flags); if (!list_empty(&brick->submitted_list[i])) { struct bio_mref_aspect *mref_a; mref_a = container_of(brick->submitted_list[i].next, struct bio_mref_aspect, io_head); eldest = mref_a->start_stamp; } spin_unlock_irqrestore(&brick->lock, flags); if (eldest) { threshold_check(&bio_io_threshold[i], cpu_clock(raw_smp_processor_id()) - eldest); } } if (count) { brick->submitted = true; brick_wake_smp(&brick->submit_event); } } done: MARS_INF("bio response thread has stopped.\n"); return 0; } static bool _bg_should_run(struct bio_brick *brick) { return (atomic_read(&brick->queue_count[2]) > 0 && atomic_read(&brick->fly_count[0]) + atomic_read(&brick->fly_count[1]) <= brick->bg_threshold && (brick->bg_maxfly <= 0 || atomic_read(&brick->fly_count[2]) < brick->bg_maxfly)); } static int bio_submit_thread(void *data) { struct bio_brick *brick = data; #ifdef IO_DEBUGGING int round = 0; #endif MARS_INF("bio submit thread has started on '%s'.\n", brick->brick_path); while (!brick_thread_should_stop()) { int prio; #ifdef IO_DEBUGGING round++; MARS_IO("%d sleeping...\n", round); #endif brick_wait_smp( brick->submit_event, brick->submitted || brick_thread_should_stop(), HZ / 2); brick->submitted = false; MARS_IO("%d woken up, completed_count = %d fly_count[0] = %d fly_count[1] = %d fly_count[2] = %d\n", round, atomic_read(&brick->completed_count), atomic_read(&brick->fly_count[0]), atomic_read(&brick->fly_count[1]), atomic_read(&brick->fly_count[2])); for (prio = 0; prio < MARS_PRIO_NR; prio++) { LIST_HEAD(tmp_list); unsigned long flags; if (prio == MARS_PRIO_NR-1 && !_bg_should_run(brick)) { break; } MARS_IO("%d pushing prio %d to foreground, completed_count = %d\n", round, prio, atomic_read(&brick->completed_count)); spin_lock_irqsave(&brick->lock, flags); list_replace_init(&brick->queue_list[prio], &tmp_list); spin_unlock_irqrestore(&brick->lock, flags); while (!list_empty(&tmp_list)) { struct list_head *tmp = tmp_list.next; struct bio_mref_aspect *mref_a; struct mref_object *mref; bool cork; list_del_init(tmp); mref_a = container_of(tmp, struct bio_mref_aspect, io_head); mref = mref_a->object; if (unlikely(!mref)) { MARS_ERR("invalid mref\n"); continue; } atomic_dec(&brick->queue_count[PRIO_INDEX(mref)]); cork = atomic_read(&brick->queue_count[PRIO_INDEX(mref)]) > 0; _bio_ref_io(mref_a->output, mref, cork); BIO_REF_PUT(mref_a->output, mref); } } } MARS_INF("bio submit thread has stopped.\n"); return 0; } static int bio_switch(struct bio_brick *brick) { int status = 0; int i; if (brick->power.button) { if (brick->power.led_on) goto done; mars_power_led_off((void*)brick, false); if (!brick->bdev) { static int index = 0; const char *path = brick->brick_path; int flags = O_RDWR | O_EXCL | O_LARGEFILE; struct address_space *mapping; struct inode *inode; struct request_queue *q; #ifdef MARS_HAS_BDI_GET struct backing_dev_info *bdi; #endif brick->error = 0; brick->mf = mapfree_get(path, flags, &brick->error); if (unlikely(!brick->mf)) { status = brick->error; if (!status) status = -ENOENT; MARS_ERR("cannot open file '%s', error=%d\n", path, brick->error); goto done; } mapfree_pages(brick->mf, -1); if (unlikely(!(mapping = brick->mf->mf_filp->f_mapping) || !(inode = mapping->host))) { MARS_ERR("internal problem with '%s'\n", path); status = -EINVAL; goto done; } if (unlikely(!S_ISBLK(inode->i_mode) || !inode->i_bdev)) { MARS_ERR("sorry, '%s' is not a block device\n", path); status = -ENODEV; goto done; } mapping_set_gfp_mask(mapping, mapping_gfp_mask(mapping) & ~(__GFP_IO | __GFP_FS)); q = bdev_get_queue(inode->i_bdev); if (unlikely(!q)) { MARS_ERR("internal queue '%s' does not exist\n", path); status = -EINVAL; goto done; } #ifdef MARS_HAS_BDI_GET bdi = I_BDEV(inode)->bd_bdi; MARS_INF("'%s' ra_pages OLD=%lu NEW=%d\n", path, bdi->ra_pages, brick->ra_pages); bdi->ra_pages = brick->ra_pages; #else MARS_INF("'%s' ra_pages OLD=%lu NEW=%d\n", path, q->backing_dev_info.ra_pages, brick->ra_pages); q->backing_dev_info.ra_pages = brick->ra_pages; #endif q->nr_requests = bio_nr_requests; brick->bvec_max = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); if (brick->bvec_max > BIO_MAX_PAGES) brick->bvec_max = BIO_MAX_PAGES; else if (brick->bvec_max <= 1) brick->bvec_max = 1; brick->total_size = i_size_read(inode); MARS_INF("'%s' size=%lld bvec_max=%d\n", path, brick->total_size, brick->bvec_max); for (i = 0; i < BIO_RESPONSE_THREADS; i++) { brick->rsp[i].response_thread = brick_thread_create(bio_response_thread, &brick->rsp[i], "mars_bio_r%d", index); } brick->submit_thread = brick_thread_create(bio_submit_thread, brick, "mars_bio_s%d", index); status = -EBADR; if (likely(brick->submit_thread)) { brick->bdev = inode->i_bdev; brick->mode_ptr = &brick->mf->mf_mode; index++; status = 0; } } } mars_power_led_on((void*)brick, brick->power.button && brick->bdev != NULL); done: if (status < 0 || !brick->power.button) { if (brick->submit_thread) { brick_thread_stop(brick->submit_thread); brick->submit_thread = NULL; } for (i = 0; i < BIO_RESPONSE_THREADS; i++) { if (brick->rsp[i].response_thread) { brick_thread_stop(brick->rsp[i].response_thread); brick->rsp[i].response_thread = NULL; } } if (brick->mf) { mapfree_put(brick->mf); brick->mf = NULL; } brick->mode_ptr = NULL; brick->bdev = NULL; if (!brick->power.button) { mars_power_led_off((void*)brick, true); brick->total_size = 0; } } return status; } //////////////// informational / statistics /////////////// static noinline char *bio_statistics(struct bio_brick *brick, int verbose) { char *res = brick_string_alloc(4096); int pos = 0; if (!res) return NULL; pos += report_timing(&timings[0], res + pos, 4096 - pos); pos += report_timing(&timings[1], res + pos, 4096 - pos); snprintf(res + pos, 4096 - pos, #ifdef MARS_BIO_DEBUG "total " "completed[0] = %d " "completed[1] = %d " "completed[2] = %d | " #endif "queued[0] = %d " "queued[1] = %d " "queued[2] = %d " "flying[0] = %d " "flying[1] = %d " "flying[2] = %d " "completing = %d\n", #ifdef MARS_BIO_DEBUG atomic_read(&brick->total_completed_count[0]), atomic_read(&brick->total_completed_count[1]), atomic_read(&brick->total_completed_count[2]), #endif atomic_read(&brick->fly_count[0]), atomic_read(&brick->queue_count[0]), atomic_read(&brick->queue_count[1]), atomic_read(&brick->queue_count[2]), atomic_read(&brick->fly_count[1]), atomic_read(&brick->fly_count[2]), atomic_read(&brick->completed_count)); return res; } static noinline void bio_reset_statistics(struct bio_brick *brick) { #ifdef MARS_BIO_DEBUG atomic_set(&brick->total_completed_count[0], 0); atomic_set(&brick->total_completed_count[1], 0); atomic_set(&brick->total_completed_count[2], 0); #endif } //////////////// object / aspect constructors / destructors /////////////// static int bio_mref_aspect_init_fn(struct generic_aspect *_ini) { struct bio_mref_aspect *ini = (void*)_ini; INIT_LIST_HEAD(&ini->io_head); return 0; } static void bio_mref_aspect_exit_fn(struct generic_aspect *_ini) { struct bio_mref_aspect *ini = (void*)_ini; (void)ini; } MARS_MAKE_STATICS(bio); ////////////////////// brick constructors / destructors //////////////////// static int bio_brick_construct(struct bio_brick *brick) { int i; spin_lock_init(&brick->lock); INIT_LIST_HEAD(&brick->queue_list[0]); INIT_LIST_HEAD(&brick->queue_list[1]); INIT_LIST_HEAD(&brick->queue_list[2]); INIT_LIST_HEAD(&brick->submitted_list[0]); INIT_LIST_HEAD(&brick->submitted_list[1]); init_waitqueue_head(&brick->submit_event); for (i = 0; i < BIO_RESPONSE_THREADS; i++) { INIT_LIST_HEAD(&brick->rsp[i].completed_list); init_waitqueue_head(&brick->rsp[i].response_event); brick->rsp[i].brick = brick; } return 0; } static int bio_brick_destruct(struct bio_brick *brick) { int i; CHECK_HEAD_EMPTY(&brick->queue_list[0]); CHECK_HEAD_EMPTY(&brick->queue_list[1]); CHECK_HEAD_EMPTY(&brick->queue_list[2]); CHECK_HEAD_EMPTY(&brick->submitted_list[0]); CHECK_HEAD_EMPTY(&brick->submitted_list[1]); for (i = 0; i < BIO_RESPONSE_THREADS; i++) { CHECK_HEAD_EMPTY(&brick->rsp[i].completed_list); } return 0; } static int bio_output_construct(struct bio_output *output) { return 0; } static int bio_output_destruct(struct bio_output *output) { return 0; } ///////////////////////// static structs //////////////////////// static struct bio_brick_ops bio_brick_ops = { .brick_switch = bio_switch, .brick_statistics = bio_statistics, .reset_statistics = bio_reset_statistics, }; static struct bio_output_ops bio_output_ops = { .mars_get_info = bio_get_info, .mref_get = bio_ref_get, .mref_put = bio_ref_put, .mref_io = bio_ref_io, }; const struct bio_input_type bio_input_type = { .type_name = "bio_input", .input_size = sizeof(struct bio_input), }; static const struct bio_input_type *bio_input_types[] = { &bio_input_type, }; const struct bio_output_type bio_output_type = { .type_name = "bio_output", .output_size = sizeof(struct bio_output), .master_ops = &bio_output_ops, .output_construct = &bio_output_construct, .output_destruct = &bio_output_destruct, }; static const struct bio_output_type *bio_output_types[] = { &bio_output_type, }; const struct bio_brick_type bio_brick_type = { .type_name = "bio_brick", .brick_size = sizeof(struct bio_brick), .max_inputs = 0, .max_outputs = 1, .master_ops = &bio_brick_ops, .aspect_types = bio_aspect_types, .default_input_types = bio_input_types, .default_output_types = bio_output_types, .brick_construct = &bio_brick_construct, .brick_destruct = &bio_brick_destruct, }; EXPORT_SYMBOL_GPL(bio_brick_type); ////////////////// module init stuff ///////////////////////// int __init init_mars_bio(void) { MARS_INF("init_bio()\n"); _bio_brick_type = (void*)&bio_brick_type; return bio_register_brick_type(); } void exit_mars_bio(void) { MARS_INF("exit_bio()\n"); bio_unregister_brick_type(); }