// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG

// Buf brick (just for demonstration)

//#define MARS_DEBUGGING

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>

#include "mars.h"

///////////////////////// own type definitions ////////////////////////

#include "mars_buf.h"

///////////////////////// own helper functions ////////////////////////

static inline int buf_hash(struct buf_brick *brick, loff_t pos)
{
	return (pos >> brick->backing_order) % MARS_BUF_HASH_MAX;
}

static struct buf_head *hash_find(struct buf_brick *brick, loff_t pos)
{
	int hash = buf_hash(brick, pos);
	struct list_head *start = &brick->cache_anchors[hash];
	struct list_head *tmp;
	struct buf_head *res;

	for (tmp = start->next; ; tmp = tmp->next) {
		if (tmp == start)
			return NULL;
		res = container_of(tmp, struct buf_head, bf_hash_head);
		if (res->bf_pos == pos)
			break;
	}
	return res;
}

static inline void hash_insert(struct buf_brick *brick, struct buf_head *elem)
{
	int hash = buf_hash(brick, elem->bf_pos);
	struct list_head *start = &brick->cache_anchors[hash];
	list_add(&elem->bf_hash_head, start);
}

static inline void free_buf(struct buf_brick *brick, struct buf_head *bf)
{
	free_pages((unsigned long)bf->bf_data, brick->backing_order);
	kfree(bf);
}

/* brick->buf_lock must be held */
static inline void prune_cache(struct buf_brick *brick)
{
	while (brick->alloc_count > brick->max_count) {
		struct buf_head *bf;
		if (list_empty(&brick->free_anchor))
			break;
		bf = container_of(brick->free_anchor.next, struct buf_head, bf_lru_head);
		list_del(&bf->bf_lru_head);
		brick->current_count--;
		brick->alloc_count--;
		spin_unlock_irq(&brick->buf_lock);
		free_buf(brick, bf);
		spin_lock_irq(&brick->buf_lock);
	}
}

static struct mars_io_object_layout *buf_init_object_layout(struct buf_output *output)
{
	const int layout_size = 1024;
	const int max_aspects = 16;
	struct mars_io_object_layout *res;
	int status;
	void *data = kzalloc(layout_size, GFP_KERNEL);
	if (!data) {
		MARS_ERR("emergency, cannot allocate object_layout!\n");
		return NULL;
	}
	res = mars_io_init_object_layout(data, layout_size, max_aspects, &mars_io_type);
	if (unlikely(!res)) {
		MARS_ERR("emergency, cannot init object_layout!\n");
		goto err_free;
	}
	status = output->ops->make_object_layout(output, (struct generic_object_layout*)res);
	if (unlikely(status < 0)) {
		MARS_ERR("emergency, cannot add aspects to object_layout!\n");
		goto err_free;
	}

	MARS_INF("OK, object_layout init succeeded.\n");
	return res;

err_free:
	kfree(data); // res may still be NULL here; free the original allocation
	return NULL;
}

static inline int get_info(struct buf_brick *brick)
{
	struct buf_input *input = brick->inputs[0];
	int status = GENERIC_INPUT_CALL(input, mars_get_info, &brick->base_info);
	if (status >= 0) {
		brick->got_info = 1;
	}
	return status;
}
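/* Cache design note: each buf_head describes one backing block of
 * PAGE_SIZE << backing_order bytes. Blocks are looked up via the hash
 * table above (cache_anchors), kept on lru_anchor while unused but
 * uptodate, and parked on free_anchor for reuse or for freeing by
 * prune_cache(). All list manipulations are protected by brick->buf_lock.
 */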
/* Convert from arbitrary kernel address/length to struct page,
 * create bio from it, round up/down to full sectors.
 * return the length (may be smaller than requested)
 */
static int make_bio(struct buf_brick *brick, struct bio **_bio, void *data, int len, loff_t pos)
{
	unsigned long sector;
	int sector_offset;
	int page_offset;
	int bvec_count;
	int status;
	int i;
	struct page *page;
	struct bio *bio = NULL;
	struct block_device *bdev;

	if (unlikely(!brick->got_info)) {
		struct request_queue *q;
		status = get_info(brick);
		if (status < 0)
			goto out;
		bdev = brick->base_info.backing_file->f_mapping->host->i_sb->s_bdev;
		q = bdev_get_queue(bdev);
		brick->bvec_max = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
	} else {
		bdev = brick->base_info.backing_file->f_mapping->host->i_sb->s_bdev;
	}

	sector = pos >> 9;                    // byte offset -> 512-byte sector index // TODO: make dynamic
	sector_offset = pos & ((1 << 9) - 1); // TODO: make dynamic

	// round down to start of first sector
	data -= sector_offset;
	len += sector_offset;
	pos -= sector_offset;

	page_offset = pos & (PAGE_SIZE - 1);
	bvec_count = len / PAGE_SIZE + 1;
	if (bvec_count > brick->bvec_max)
		bvec_count = brick->bvec_max;

	bio = bio_alloc(GFP_KERNEL, bvec_count);
	status = -ENOMEM;
	if (!bio)
		goto out;

	status = 0;
	for (i = 0; i < bvec_count && len > 0; i++) {
		int myrest = PAGE_SIZE - page_offset;
		int mylen = len;
		int iolen;
		int iooffset;

		if (mylen > myrest)
			mylen = myrest;

		// round last sector up
		iolen = mylen;
		iooffset = iolen % (1 << 9); // TODO: make dynamic
		if (iooffset)
			iolen += (1 << 9) - iooffset; // TODO: make dynamic

		page = virt_to_page(data);

		bio->bi_io_vec[i].bv_page = page;
		bio->bi_io_vec[i].bv_len = iolen;
		bio->bi_io_vec[i].bv_offset = page_offset;

		data += mylen;
		len -= mylen;
		status += mylen;
		page_offset = 0;
	}

	bio->bi_vcnt = i;
	bio->bi_idx = 0;
	bio->bi_size = i * PAGE_SIZE;
	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_private = NULL; // must be filled in later
	bio->bi_end_io = NULL; // must be filled in later

	if (status >= sector_offset)
		status -= sector_offset;

out:
	*_bio = bio;
	return status;
}

////////////////// own brick / input / output operations //////////////////

static int buf_io(struct buf_output *output, struct mars_io_object *mio)
{
	struct buf_input *input = output->brick->inputs[0];
	return GENERIC_INPUT_CALL(input, mars_io, mio);
}

static int buf_get_info(struct buf_output *output, struct mars_info *info)
{
	struct buf_input *input = output->brick->inputs[0];
	return GENERIC_INPUT_CALL(input, mars_get_info, info);
}

static int buf_buf_get(struct buf_output *output, struct mars_buf_object **_mbuf, struct mars_buf_object_layout *buf_layout, loff_t pos, int len)
{
	struct buf_brick *brick = output->brick;
	void *data;
	struct mars_buf_object *mbuf;
	struct buf_mars_buf_aspect *mbuf_a;
	struct buf_head *bf;
	loff_t base_pos;
	int status = -EILSEQ;

	data = kzalloc(buf_layout->object_size, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	mbuf = mars_buf_construct(data, buf_layout);
	if (!mbuf)
		goto err_free;

	mbuf_a = buf_mars_buf_get_aspect(output, mbuf);
	if (!mbuf_a)
		goto err_free;

	spin_lock_init(&mbuf->buf_lock);
	mbuf->buf_pos = pos;
	base_pos = pos & ~((loff_t)brick->backing_size - 1);

	spin_lock_irq(&brick->buf_lock);

	bf = hash_find(brick, base_pos);
	if (!bf) {
		MARS_DBG("buf_get() hash nothing found\n");
		if (unlikely(list_empty(&brick->free_anchor))) {
			struct buf_head *test_bf;
			MARS_DBG("buf_get() alloc new buf_head\n");

			spin_unlock_irq(&brick->buf_lock);

			status = -ENOMEM;
			bf = kzalloc(sizeof(struct buf_head), GFP_KERNEL);
			if (!bf)
				goto err_free;

			bf->bf_data = (void*)__get_free_pages(GFP_KERNEL, brick->backing_order);
			if (!bf->bf_data)
				goto err_free2;

			bf->bf_brick = brick;
			atomic_set(&bf->bf_bio_count, 0);
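			/* buf_lock is currently dropped, so the fresh buf_head is
			 * not yet visible to anybody else; initialize all list
			 * anchors before re-acquiring the lock below. */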
			//INIT_LIST_HEAD(&bf->bf_mbuf_anchor);
			INIT_LIST_HEAD(&bf->bf_lru_head);
			INIT_LIST_HEAD(&bf->bf_hash_head);
			INIT_LIST_HEAD(&bf->bf_io_pending_anchor);
			INIT_LIST_HEAD(&bf->bf_again_write_pending_anchor);

			spin_lock_irq(&brick->buf_lock);
			brick->alloc_count++;
			/* during the open lock, somebody might have raced
			 * against us at the same base_pos...
			 */
			test_bf = hash_find(brick, base_pos);
			if (unlikely(test_bf)) {
				free_buf(brick, bf);
				bf = test_bf;
			}
		} else {
			bf = container_of(brick->free_anchor.next, struct buf_head, bf_lru_head);
		}
		MARS_DBG("buf_get() bf=%p\n", bf);
		bf->bf_pos = base_pos;
		bf->bf_flags = 0;
		atomic_set(&bf->bf_count, 0);

		hash_insert(brick, bf);
		brick->current_count++;
	}

	mbuf_a->bfa_bf = bf;
	atomic_inc(&bf->bf_count);
	MARS_DBG("buf_get() bf=%p initial bf_count=%d\n", bf, atomic_read(&bf->bf_count));
	list_del_init(&bf->bf_lru_head);

	mbuf->buf_flags = bf->bf_flags;

	spin_unlock_irq(&brick->buf_lock);

	mbuf->buf_data = bf->bf_data + (pos - base_pos);
	mbuf->buf_len = brick->backing_size - (pos - base_pos);

	*_mbuf = mbuf;
	if (len > mbuf->buf_len)
		len = mbuf->buf_len;
	return len;

err_free2:
	kfree(bf);
err_free:
	kfree(data);
	return status;
}

static int _buf_buf_put(struct buf_head *bf)
{
	struct buf_brick *brick;

	MARS_DBG("_buf_buf_put() bf=%p bf_count=%d\n", bf, atomic_read(&bf->bf_count));

	if (!atomic_dec_and_test(&bf->bf_count))
		return 0;

	MARS_DBG("_buf_buf_put() ZERO_COUNT\n");

	brick = bf->bf_brick;

	spin_lock_irq(&brick->buf_lock);

	if (atomic_read(&bf->bf_count) <= 0) {
		struct list_head *where = &brick->lru_anchor;

		BUG_ON(bf->bf_flags & (MARS_BUF_READING | MARS_BUF_WRITING));

		if (unlikely(!(bf->bf_flags & MARS_BUF_UPTODATE))) {
			list_del_init(&bf->bf_hash_head);
			brick->current_count--;
			where = &brick->free_anchor;
		}
		list_del(&bf->bf_lru_head);
		list_add(&bf->bf_lru_head, where);
	}

	// lru freeing
	while (brick->current_count > brick->max_count) {
		if (list_empty(&brick->lru_anchor))
			break;
		bf = container_of(brick->lru_anchor.prev, struct buf_head, bf_lru_head);
		list_del(&bf->bf_lru_head);
		list_del_init(&bf->bf_hash_head);
		brick->current_count--;
		list_add(&bf->bf_lru_head, &brick->free_anchor);
	}

	prune_cache(brick);

	spin_unlock_irq(&brick->buf_lock);

	return 0;
}

static int buf_buf_put(struct buf_output *output, struct mars_buf_object *mbuf)
{
	struct buf_mars_buf_aspect *mbuf_a;
	struct buf_head *bf;

	mbuf_a = buf_mars_buf_get_aspect(output, mbuf);
	if (!mbuf_a)
		return -EILSEQ;

	bf = mbuf_a->bfa_bf;
	MARS_DBG("buf_buf_put() mbuf=%p mbuf_a=%p bf=%p\n", mbuf, mbuf_a, bf);

	return _buf_buf_put(bf);
}

static int _buf_endio(struct mars_io_object *mio)
{
	struct bio *bio = mio->orig_bio;

	MARS_DBG("_buf_endio() mio=%p bio=%p\n", mio, bio);
	if (bio) {
		mio->orig_bio = NULL;
		if (!bio->bi_size) {
			bio_endio(bio, 0);
		} else {
			MARS_ERR("NYI: RETRY LOGIC %u\n", bio->bi_size);
			bio_endio(bio, -EIO);
		}
	} // else lower layers have already signalled the orig_bio

	kfree(mio);
	return 0;
}

static void _buf_bio_callback(struct bio *bio, int code);

static int _buf_make_bios(struct buf_brick *brick, struct buf_head *bf, void *start_data, loff_t start_pos, int start_len)
{
	while (start_len > 0) {
		struct buf_input *input = brick->inputs[0];
		struct mars_io_object *data;
		struct mars_io_object *mio;
		struct buf_mars_io_aspect *mio_a;
		struct bio *bio = NULL;
		int len;
		int status;

		if (unlikely(!brick->mio_layout)) {
			brick->mio_layout = buf_init_object_layout(brick->outputs[0]);
			if (!brick->mio_layout)
				return -ENOMEM;
		}

		data = kzalloc(brick->mio_layout->object_size, GFP_KERNEL);
		if (!data)
			return -ENOMEM;
		mio = mars_io_construct(data, brick->mio_layout);
		if (!mio)
			goto err_free;

		mio_a = buf_mars_io_get_aspect(brick->outputs[0], mio);
		if (!mio_a)
			goto err_free2;

		len = make_bio(brick, &bio, start_data, start_len, start_pos);
		if (len < 0 || !bio)
			goto err_free2;

		mio_a->mia_bf = bf;
		atomic_inc(&bf->bf_bio_count);

		bio->bi_private = mio_a;
		bio->bi_end_io = _buf_bio_callback;
		mio->orig_bio = bio;
		mio->mars_endio = _buf_endio;

		MARS_DBG("starting buf IO mio=%p bio=%p len=%d bf=%p bf_count=%d bf_bio_count=%d\n", mio, bio, len, bf, atomic_read(&bf->bf_count), atomic_read(&bf->bf_bio_count));
#if 1
		status = GENERIC_INPUT_CALL(input, mars_io, mio);
		if (status < 0)
			goto err_free3;
#else
		// fake IO for testing
		bio->bi_size = 0;
		mio->mars_endio(mio);
#endif

		// advance to the next portion of the request
		start_data += len;
		start_pos += len;
		start_len -= len;
		continue;

err_free3:
		atomic_dec(&bf->bf_bio_count);
		bio_put(bio);
err_free2:
		kfree(mio);
err_free:
		kfree(data);
		return -EIO;
	}
	return 0;
}

static void _buf_bio_callback(struct bio *bio, int code)
{
	struct buf_mars_io_aspect *mio_a;
	struct buf_head *bf;
	struct buf_brick *brick;
	void *start_data = NULL;
	loff_t start_pos = 0;
	int start_len = 0;
	int old_flags;

	mio_a = bio->bi_private;
	bf = mio_a->mia_bf;

	MARS_DBG("_buf_bio_callback() mio=%p bio=%p bf=%p bf_count=%d bf_bio_count=%d code=%d\n", mio_a->object, bio, bf, atomic_read(&bf->bf_count), atomic_read(&bf->bf_bio_count), code);

	if (unlikely(mio_a->mia_end_io_called)) {
		MARS_ERR("Oops, somebody called us twice on the same bio. I'm not amused.\n");
		msleep(5000);
		return;
	} else {
		mio_a->mia_end_io_called = true;
		bio_put(bio);
	}

	if (code < 0) {
		// this can race, but we don't worry about the exact error code
		bf->bf_bio_status = code;
	}

	if (!atomic_dec_and_test(&bf->bf_bio_count))
		return;

	MARS_DBG("_buf_bio_callback() ZERO_COUNT mio=%p bio=%p bf=%p code=%d\n", mio_a->object, bio, bf, code);

	brick = bf->bf_brick;

	spin_lock_irq(&brick->buf_lock);

	// signal success by calling all callbacks.
	while (!list_empty(&bf->bf_io_pending_anchor)) {
		struct buf_mars_buf_callback_aspect *mbuf_cb_a = container_of(bf->bf_io_pending_anchor.next, struct buf_mars_buf_callback_aspect, bfc_pending_head);
		struct mars_buf_callback_object *mbuf_cb = mbuf_cb_a->object;

		BUG_ON(mbuf_cb_a->bfc_bfa->bfa_bf != bf);
		mbuf_cb->cb_error = bf->bf_bio_status;
		list_del(&mbuf_cb_a->bfc_pending_head);

		/* drop normal refcount.
		 * full _buf_buf_put() not needed, see below. */
		atomic_dec(&bf->bf_count);
		MARS_DBG("_buf_bio_callback() bf=%p now bf_count=%d\n", bf, atomic_read(&bf->bf_count));

		spin_unlock_irq(&brick->buf_lock);
		mbuf_cb->cb_buf_endio(mbuf_cb);
		spin_lock_irq(&brick->buf_lock);
	}

	old_flags = bf->bf_flags;
	if (!bf->bf_bio_status && (old_flags & MARS_BUF_READING)) {
		bf->bf_flags |= MARS_BUF_UPTODATE;
	}
	// clear the flags. may be re-enabled later.
	bf->bf_flags &= ~(MARS_BUF_READING | MARS_BUF_WRITING);

	/* move pending jobs to work.
	 * this is in essence an automatic restart mechanism.
	 */
	while (!list_empty(&bf->bf_again_write_pending_anchor)) {
		struct buf_mars_buf_callback_aspect *mbuf_cb_a = container_of(bf->bf_again_write_pending_anchor.next, struct buf_mars_buf_callback_aspect, bfc_pending_head);
		struct mars_buf_object *mbuf = mbuf_cb_a->bfc_bfa->object;

		BUG_ON(mbuf_cb_a->bfc_bfa->bfa_bf != bf);
		list_del(&mbuf_cb_a->bfc_pending_head);
		list_add_tail(&mbuf_cb_a->bfc_pending_head, &bf->bf_io_pending_anchor);

		// re-enable flags
		bf->bf_flags |= MARS_BUF_WRITING;
		bf->bf_bio_status = 0;

		if (!start_len) {
			start_data = mbuf->buf_data;
			start_pos = mbuf->buf_pos;
			start_len = mbuf->buf_len;
		} else if (start_data != mbuf->buf_data ||
			   start_pos != mbuf->buf_pos ||
			   start_len != mbuf->buf_len) {
			start_data = bf->bf_data;
			start_pos = bf->bf_pos;
			start_len = brick->backing_size;
		}
	}

	spin_unlock_irq(&brick->buf_lock);

	if (start_len) {
		// in this case, the extra refcount is kept => nothing to do
		_buf_make_bios(brick, bf, start_data, start_pos, start_len);
	} else if (old_flags & (MARS_BUF_READING | MARS_BUF_WRITING)) {
		// drop extra refcount for pending IO
		_buf_buf_put(bf);
	}
}

static int buf_buf_io(struct buf_output *output, struct mars_buf_callback_object *mbuf_cb)
{
	struct buf_brick *brick = output->brick;
	struct mars_buf_object *mbuf = mbuf_cb->cb_mbuf;
	struct buf_mars_buf_aspect *mbuf_a;
	struct buf_mars_buf_callback_aspect *mbuf_cb_a;
	struct buf_head *bf;
	void *start_data = NULL;
	loff_t start_pos = 0;
	int start_len = 0;

	if (!mbuf) {
		MARS_ERR("internal problem: forgotten to supply mbuf\n");
		return -EILSEQ;
	}

	mbuf_cb_a = buf_mars_buf_callback_get_aspect(output, mbuf_cb);
	if (!mbuf_cb_a) {
		MARS_ERR("internal problem: mbuf_cb aspect does not work\n");
		return -EILSEQ;
	}

	mbuf_a = buf_mars_buf_get_aspect(output, mbuf);
	if (!mbuf_a) {
		MARS_ERR("internal problem: mbuf aspect does not work\n");
		return -EILSEQ;
	}

	mbuf_cb_a->bfc_bfa = mbuf_a;
	bf = mbuf_a->bfa_bf;

	spin_lock_irq(&brick->buf_lock);

	if (mbuf_cb->cb_rw) { // WRITE
		BUG_ON(bf->bf_flags & MARS_BUF_READING);
		if (!(bf->bf_flags & MARS_BUF_WRITING)) {
			bf->bf_flags |= MARS_BUF_WRITING;
			bf->bf_bio_status = 0;
			// grab an extra refcount for pending IO
			atomic_inc(&bf->bf_count);
			MARS_DBG("buf_buf_io() bf=%p extra bf_count=%d\n", bf, atomic_read(&bf->bf_count));
			start_data = mbuf->buf_data;
			start_pos = mbuf->buf_pos;
			start_len = mbuf->buf_len;
			list_add(&mbuf_cb_a->bfc_pending_head, &bf->bf_io_pending_anchor);
		} else {
			list_add(&mbuf_cb_a->bfc_pending_head, &bf->bf_again_write_pending_anchor);
			MARS_INF("postponing %lld %d\n", mbuf->buf_pos, mbuf->buf_len);
		}
	} else { // READ
		if (bf->bf_flags & (MARS_BUF_UPTODATE | MARS_BUF_WRITING)) {
			spin_unlock_irq(&brick->buf_lock);
			return mbuf_cb->cb_buf_endio(mbuf_cb);
		}
		if (!(bf->bf_flags & MARS_BUF_READING)) {
			bf->bf_flags |= MARS_BUF_READING;
			bf->bf_bio_status = 0;
			// grab an extra refcount for pending IO
			atomic_inc(&bf->bf_count);
			MARS_DBG("buf_buf_io() bf=%p extra bf_count=%d\n", bf, atomic_read(&bf->bf_count));
			// read the whole backing block: round down to its start
			start_data = (void*)((unsigned long)mbuf->buf_data & ~(unsigned long)(brick->backing_size - 1));
			start_pos = mbuf->buf_pos & ~((loff_t)brick->backing_size - 1);
			start_len = brick->backing_size;
		}
		list_add(&mbuf_cb_a->bfc_pending_head, &bf->bf_io_pending_anchor);
	}

	// grab normal refcount for each mbuf_cb
	atomic_inc(&bf->bf_count);
	MARS_DBG("buf_buf_io() bf=%p normal bf_count=%d\n", bf, atomic_read(&bf->bf_count));

	spin_unlock_irq(&brick->buf_lock);

	return _buf_make_bios(brick, bf, start_data, start_pos, start_len);
}
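/* Reference counting used by the operations above (summary):
 * buf_buf_get() takes one bf_count reference per handed-out mars_buf_object,
 * buf_buf_io() adds one "normal" reference per queued callback plus one
 * "extra" reference while a read/write round is in flight, and
 * _buf_bio_callback() / _buf_buf_put() drop them again; a buf_head whose
 * count reaches zero migrates to the LRU list (uptodate) or the free list.
 */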
//////////////// object / aspect constructors / destructors ///////////////

static int buf_mars_io_aspect_init_fn(struct generic_aspect *_ini, void *_init_data)
{
	struct buf_mars_io_aspect *ini = (void*)_ini;
	ini->mia_bf = NULL;
	ini->mia_end_io_called = false;
	return 0;
}

static int buf_mars_buf_aspect_init_fn(struct generic_aspect *_ini, void *_init_data)
{
	struct buf_mars_buf_aspect *ini = (void*)_ini;
	ini->bfa_bf = NULL;
	return 0;
}

static int buf_mars_buf_callback_aspect_init_fn(struct generic_aspect *_ini, void *_init_data)
{
	struct buf_mars_buf_callback_aspect *ini = (void*)_ini;
	INIT_LIST_HEAD(&ini->bfc_pending_head);
	ini->bfc_bfa = NULL;
	return 0;
}

MARS_MAKE_STATICS(buf);

static int buf_make_object_layout(struct buf_output *output, struct generic_object_layout *object_layout)
{
	const struct generic_object_type *object_type = object_layout->object_type;
	int status;
	int aspect_size = 0;
	struct buf_brick *brick = output->brick;
	int i;

	if (object_type == &mars_io_type) {
		aspect_size = sizeof(struct buf_mars_io_aspect);
		status = buf_mars_io_add_aspect(output, object_layout, &buf_mars_io_aspect_type);
	} else if (object_type == &mars_buf_type) {
		aspect_size = sizeof(struct buf_mars_buf_aspect);
		status = buf_mars_buf_add_aspect(output, object_layout, &buf_mars_buf_aspect_type);
		if (status < 0)
			return status;
		return aspect_size;
	} else if (object_type == &mars_buf_callback_type) {
		aspect_size = sizeof(struct buf_mars_buf_callback_aspect);
		status = buf_mars_buf_callback_add_aspect(output, object_layout, &buf_mars_buf_callback_aspect_type);
		if (status < 0)
			return status;
		return aspect_size;
	} else {
		return 0;
	}
	if (status < 0)
		return status;

	for (i = 0; i < brick->type->max_inputs; i++) {
		struct buf_input *input = brick->inputs[i];
		if (input && input->connect) {
			int substatus = input->connect->ops->make_object_layout(input->connect, object_layout);
			if (substatus < 0)
				return substatus;
			aspect_size += substatus;
		}
	}

	return aspect_size;
}

////////////////////// brick constructors / destructors ////////////////////

static int buf_brick_construct(struct buf_brick *brick)
{
	int i;
	brick->backing_order = 5; // TODO: make this configurable
	brick->backing_size = PAGE_SIZE << brick->backing_order;
	brick->max_count = 32; // TODO: make this configurable
	brick->current_count = 0;
	brick->alloc_count = 0;
	spin_lock_init(&brick->buf_lock);
	INIT_LIST_HEAD(&brick->free_anchor);
	INIT_LIST_HEAD(&brick->lru_anchor);
	for (i = 0; i < MARS_BUF_HASH_MAX; i++) {
		INIT_LIST_HEAD(&brick->cache_anchors[i]);
	}
	return 0;
}

static int buf_output_construct(struct buf_output *output)
{
	return 0;
}

///////////////////////// static structs ////////////////////////

static struct buf_brick_ops buf_brick_ops = {
};

static struct buf_output_ops buf_output_ops = {
	.make_object_layout = buf_make_object_layout,
	.mars_io = buf_io,
	.mars_get_info = buf_get_info,
	.mars_buf_get = buf_buf_get,
	.mars_buf_put = buf_buf_put,
	.mars_buf_io = buf_buf_io,
};

static const struct buf_input_type buf_input_type = {
	.type_name = "buf_input",
	.input_size = sizeof(struct buf_input),
};

static const struct buf_input_type *buf_input_types[] = {
	&buf_input_type,
};

static const struct buf_output_type buf_output_type = {
	.type_name = "buf_output",
	.output_size = sizeof(struct buf_output),
	.master_ops = &buf_output_ops,
	.output_construct = &buf_output_construct,
};

static const struct buf_output_type *buf_output_types[] = {
	&buf_output_type,
};

const struct buf_brick_type buf_brick_type = {
	.type_name = "buf_brick",
	.brick_size = sizeof(struct buf_brick),
	.max_inputs = 1,
	.max_outputs = 1,
	.master_ops = &buf_brick_ops,
	.default_input_types = buf_input_types,
	.default_output_types = buf_output_types,
	.brick_construct = &buf_brick_construct,
};
EXPORT_SYMBOL_GPL(buf_brick_type);

////////////////// module init stuff /////////////////////////

static int __init init_buf(void)
{
	printk(MARS_INFO "init_buf()\n");
	return buf_register_brick_type();
}

static void __exit exit_buf(void)
{
	printk(MARS_INFO "exit_buf()\n");
	buf_unregister_brick_type();
}

MODULE_DESCRIPTION("MARS buf brick");
MODULE_AUTHOR("Thomas Schoebel-Theuer");
MODULE_LICENSE("GPL");

module_init(init_buf);
module_exit(exit_buf);