From 4be66aee51fd294bbc2b95dcec116b6946aa134c Mon Sep 17 00:00:00 2001 From: sage Date: Sat, 10 Dec 2005 00:40:04 +0000 Subject: [PATCH] *** empty log message *** git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@519 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 12 +- ceph/ebofs/Allocator.cc | 28 +- ceph/ebofs/Allocator.h | 7 + ceph/ebofs/BlockDevice.cc | 2 +- ceph/ebofs/BufferCache.cc | 13 +- ceph/ebofs/BufferCache.h | 29 +- ceph/ebofs/Ebofs.cc | 576 +++++++++++++++++++++++-------- ceph/ebofs/Ebofs.h | 63 +++- ceph/ebofs/Onode.h | 164 ++++++++- ceph/ebofs/Table.h | 9 +- ceph/ebofs/mkfs.ebofs.cc | 6 +- ceph/ebofs/nodes.h | 658 ++++++++++++++++++++++-------------- ceph/ebofs/types.h | 22 +- ceph/include/interval_set.h | 194 +++++++++++ 14 files changed, 1316 insertions(+), 467 deletions(-) create mode 100644 ceph/include/interval_set.h diff --git a/ceph/TODO b/ceph/TODO index 15c7e775c65..15bbe5e8495 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -4,12 +4,14 @@ client ofs -- object map : object_t -> block_t -- col map : coll_t -> block_t -- freelists : block_t -> block_t +- on disk btree nodes need super_version +- commit thread +- reallocate on write +- -- objects -- collections set +- Ebofs.free_blocks +- Ebofs.readonly +- real error codes diff --git a/ceph/ebofs/Allocator.cc b/ceph/ebofs/Allocator.cc index 025edae941d..dacde19a516 100644 --- a/ceph/ebofs/Allocator.cc +++ b/ceph/ebofs/Allocator.cc @@ -85,7 +85,7 @@ int Allocator::allocate(Extent& ex, block_t num, block_t near) ex.start += left.length; ex.length -= left.length; assert(ex.length == num); - release(left); + release_now(left); } else { // take middle part. Extent left,right; @@ -95,8 +95,8 @@ int Allocator::allocate(Extent& ex, block_t num, block_t near) right.start = ex.start + num; right.length = ex.length - left.length - num; ex.length = num; - release(left); - release(right); + release_now(left); + release_now(right); } } else { @@ -105,7 +105,7 @@ int Allocator::allocate(Extent& ex, block_t num, block_t near) right.start = ex.start + num; right.length = ex.length - num; ex.length = num; - release(right); + release_now(right); } } @@ -135,7 +135,25 @@ int Allocator::allocate(Extent& ex, block_t num, block_t near) return -1; } -int Allocator::release(Extent& ex) +int Allocator::release(Extent& ex) +{ + dout(1) << "release " << ex << " (into limbo)" << endl; + limbo.insert(ex.start, ex.length); + return 0; +} + +int Allocator::release_limbo() +{ + for (map::iterator i = limbo.m.begin(); + i != limbo.m.end(); + i++) { + Extent ex(i->first, i->second); + release_now(ex); + } + return 0; +} + +int Allocator::release_now(Extent& ex) { Extent newex = ex; diff --git a/ceph/ebofs/Allocator.h b/ceph/ebofs/Allocator.h index 1c8fce4a340..02473739906 100644 --- a/ceph/ebofs/Allocator.h +++ b/ceph/ebofs/Allocator.h @@ -3,12 +3,16 @@ #include "types.h" +#include "include/interval_set.h" + class Ebofs; class Allocator { protected: Ebofs *fs; + interval_set limbo; + static int pick_bucket(block_t num) { int b = 0; while (num > 1) { @@ -29,6 +33,9 @@ class Allocator { int allocate(Extent& ex, block_t num, block_t near=0); int release(Extent& ex); + int release_now(Extent& ex); + + int release_limbo(); }; #endif diff --git a/ceph/ebofs/BlockDevice.cc b/ceph/ebofs/BlockDevice.cc index af8bf95ce89..1906f2104fb 100644 --- a/ceph/ebofs/BlockDevice.cc +++ b/ceph/ebofs/BlockDevice.cc @@ -268,7 +268,7 @@ int BlockDevice::cancel_io(ioh_t ioh) // FIXME? if (r == 0 && pbio->context) { - pbio->context->finish(0); + //pbio->context->finish(0); ******HELP!!! delete pbio->context; delete pbio; } diff --git a/ceph/ebofs/BufferCache.cc b/ceph/ebofs/BufferCache.cc index 074edf44c1a..19d24daa2c2 100644 --- a/ceph/ebofs/BufferCache.cc +++ b/ceph/ebofs/BufferCache.cc @@ -92,7 +92,12 @@ void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, version_t p->second->set_last_flushed(version); bc->mark_clean(p->second); - if (p->second->ioh == ioh) p->second->ioh = 0; + if (p->second->ioh == ioh) { + p->second->ioh = 0; + } + else if (p->second->shadow_ioh == ioh) { + p->second->shadow_ioh = 0; + } // trigger waiters waiters.splice(waiters.begin(), p->second->waitfor_flush); @@ -188,6 +193,12 @@ int ObjectCache::map_read(block_t start, block_t len, return 0; } + +/* + * map a range of pages on an object's buffer cache. + * + * - break up bufferheads that don't fall completely within the range + */ int ObjectCache::map_write(block_t start, block_t len, map& hits) { diff --git a/ceph/ebofs/BufferCache.h b/ceph/ebofs/BufferCache.h index aa62cda96d7..2d13648fb75 100644 --- a/ceph/ebofs/BufferCache.h +++ b/ceph/ebofs/BufferCache.h @@ -27,7 +27,12 @@ class BufferHead : public LRUObject { public: ObjectCache *oc; - bufferlist data; + bufferlist data, shadow_data; + ioh_t ioh, shadow_ioh; // any pending read/write op + version_t tx_epoch; // epoch this write is in + + list waitfor_read; + list waitfor_flush; private: map partial; // partial dirty content overlayed onto incoming data @@ -36,20 +41,16 @@ class BufferHead : public LRUObject { int state; version_t version; // current version in cache version_t last_flushed; // last version flushed to disk - + Extent object_loc; // block position _in_object_ utime_t dirty_stamp; - public: - ioh_t ioh; // any pending read/write op - - list waitfor_read; - list waitfor_flush; - public: BufferHead(ObjectCache *o) : - oc(o), ref(0), state(STATE_MISSING), version(0), last_flushed(0), ioh(0) {} + oc(o), ioh(0), shadow_ioh(0), tx_epoch(0), + ref(0), state(STATE_MISSING), version(0), last_flushed(0) + {} ObjectCache *get_oc() { return oc; } @@ -337,10 +338,9 @@ public: class BufferCache { public: - Mutex lock; - - BlockDevice& dev; - AlignedBufferPool& bufferpool; + Mutex &lock; // hack: ref to global lock + BlockDevice &dev; + AlignedBufferPool &bufferpool; set dirty_bh; @@ -358,7 +358,8 @@ class BufferCache { off_t stat_missing; public: - BufferCache(BlockDevice& d, AlignedBufferPool& bp) : dev(d), bufferpool(bp), + BufferCache(BlockDevice& d, AlignedBufferPool& bp, Mutex& glock) : + lock(glock), dev(d), bufferpool(bp), stat_waiter(0), stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) {} diff --git a/ceph/ebofs/Ebofs.cc b/ceph/ebofs/Ebofs.cc index 96b55aa0290..77b4471d09a 100644 --- a/ceph/ebofs/Ebofs.cc +++ b/ceph/ebofs/Ebofs.cc @@ -8,49 +8,65 @@ int Ebofs::mount() { + // note: this will fail in mount -> unmount -> mount type situations, bc + // prior state isn't fully cleaned up. + + ebofs_lock.Lock(); assert(!mounted); // read super - bufferptr bp1, bp2; + bufferptr bp1 = bufferpool.alloc(EBOFS_BLOCK_SIZE); + bufferptr bp2 = bufferpool.alloc(EBOFS_BLOCK_SIZE); dev.read(0, 1, bp1); dev.read(1, 1, bp2); struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - struct ebofs_super *sb = 0; + dout(2) << "mount super @0 epoch " << sb1->epoch << endl; + dout(2) << "mount super @1 epoch " << sb2->epoch << endl; // pick newest super - dout(2) << "mount super @0 v " << sb1->version << endl; - dout(2) << "mount super @1 v " << sb2->version << endl; - if (sb1->version > sb2->version) + struct ebofs_super *sb = 0; + if (sb1->epoch > sb2->epoch) sb = sb1; else sb = sb2; - - super_version = sb->version; + super_epoch = sb->epoch; + dout(2) << "mount epoch " << super_epoch << endl; + assert(super_epoch == sb->epoch); // init node pools - dout(2) << "mount table nodepool" << endl; - table_nodepool.read( dev, &sb->table_nodepool ); + dout(2) << "mount nodepool" << endl; + nodepool.init( &sb->nodepool ); + nodepool.read_usemap( dev, super_epoch ); + nodepool.read_clean_nodes( dev ); // open tables dout(2) << "mount opening tables" << endl; - object_tab = new Table( table_nodepool, sb->object_tab ); + object_tab = new Table( nodepool, sb->object_tab ); for (int i=0; i( table_nodepool, sb->free_tab[i] ); - - collection_tab = new Table( table_nodepool, sb->collection_tab ); - oc_tab = new Table( table_nodepool, sb->oc_tab ); - co_tab = new Table( table_nodepool, sb->co_tab ); - + free_tab[i] = new Table( nodepool, sb->free_tab[i] ); + + collection_tab = new Table( nodepool, sb->collection_tab ); + oc_tab = new Table( nodepool, sb->oc_tab ); + co_tab = new Table( nodepool, sb->co_tab ); + + dout(2) << "mount starting commit thread" << endl; + commit_thread.create(); + dout(2) << "mount mounted" << endl; mounted = true; + + ebofs_lock.Unlock(); return 0; } int Ebofs::mkfs() { + ebofs_lock.Lock(); + assert(!mounted); + block_t num_blocks = dev.get_num_blocks(); // create first noderegion @@ -58,89 +74,110 @@ int Ebofs::mkfs() nr.start = 2; nr.length = num_blocks / 10000; if (nr.length < 10) nr.length = 10; - NodeRegion *r = new NodeRegion(0,nr); - table_nodepool.add_region(r); - table_nodepool.init_all_free(); + nodepool.add_region(nr); dout(1) << "mkfs: first node region at " << nr << endl; + + // allocate two usemaps + block_t usemap_len = ((nr.length-1) / 8 / EBOFS_BLOCK_SIZE) + 1; + nodepool.usemap_even.start = nr.end(); + nodepool.usemap_even.length = usemap_len; + nodepool.usemap_odd.start = nodepool.usemap_even.end(); + nodepool.usemap_odd.length = usemap_len; + dout(1) << "mkfs: even usemap at " << nodepool.usemap_even << endl; + dout(1) << "mkfs: odd usemap at " << nodepool.usemap_odd << endl; // init tables struct ebofs_table empty; empty.num_keys = 0; empty.root = -1; empty.depth = 0; - - object_tab = new Table( table_nodepool, empty ); - collection_tab = new Table( table_nodepool, empty ); + + object_tab = new Table( nodepool, empty ); + collection_tab = new Table( nodepool, empty ); for (int i=0; i( table_nodepool, empty ); - - oc_tab = new Table( table_nodepool, empty ); - co_tab = new Table( table_nodepool, empty ); + free_tab[i] = new Table( nodepool, empty ); + + oc_tab = new Table( nodepool, empty ); + co_tab = new Table( nodepool, empty ); // add free space Extent left; - left.start = nr.start + nr.length; + left.start = nodepool.usemap_odd.end(); left.length = num_blocks - left.start; dout(1) << "mkfs: free blocks at " << left << endl; - allocator.release( left ); + allocator.release_now( left ); - // write nodes - dout(1) << "mkfs: flushing nodepool" << endl; - table_nodepool.induce_full_flush(); - table_nodepool.flush( dev ); + // write nodes, super, 2x + dout(1) << "mkfs: flushing nodepool and superblocks (2x)" << endl; + + nodepool.commit_start( dev, 0 ); + nodepool.commit_wait(); + bufferptr superbp0; + prepare_super(0, superbp0); + write_super(0, superbp0); - // write super (2x) - dout(1) << "mkfs: writing superblocks" << endl; - super_version = 0; - write_super(); - write_super(); + nodepool.commit_start( dev, 1 ); + nodepool.commit_wait(); + bufferptr superbp1; + prepare_super(1, superbp1); + write_super(1, superbp1); - mounted = true; + // free memory + dout(1) << "mkfs: cleaning up" << endl; + close_tables(); + + dout(1) << "mkfs: done" << endl; + + ebofs_lock.Unlock(); return 0; } -int Ebofs::umount() +void Ebofs::close_tables() { - mounted = false; - - // wait - - // flush - dout(1) << "umount: flushing nodepool" << endl; - table_nodepool.flush( dev ); - - // super - dout(1) << "umount: writing superblocks" << endl; - write_super(); - // close tables delete object_tab; for (int i=0; iget_depth(); // pools - sb.table_nodepool.num_regions = table_nodepool.get_num_regions(); - for (int i=0; iget_attr_bytes() + on->get_extent_bytes(); unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; bufferlist bl; bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - // place on disk - if (on->onode_loc.length < blocks) { - // relocate onode! + // (always) relocate onode + if (1) { if (on->onode_loc.length) allocator.release(on->onode_loc); - + block_t first = 0; if (on->extents.size()) first = on->extents[0].start; @@ -276,7 +374,7 @@ void Ebofs::write_onode(Onode *on) object_tab->remove( on->object_id ); object_tab->insert( on->object_id, on->onode_loc ); } - + struct ebofs_onode eo; eo.onode_loc = on->onode_loc; eo.object_id = on->object_id; @@ -306,7 +404,7 @@ void Ebofs::write_onode(Onode *on) } // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl ); + dev.write( on->onode_loc.start, on->onode_loc.length, bl, c ); } void Ebofs::remove_onode(Onode *on) @@ -330,7 +428,13 @@ void Ebofs::put_onode(Onode *on) on->put(); } - +void Ebofs::dirty_onode(Onode *on) +{ + if (!on->is_dirty()) { + on->mark_dirty(); + dirty_onodes.insert(on); + } +} void Ebofs::trim_onode_cache() { @@ -408,21 +512,20 @@ Cnode* Ebofs::get_cnode(object_t cid) return cn; } -void Ebofs::write_cnode(Cnode *cn) +void Ebofs::write_cnode(Cnode *cn, Context *c) { - // allocate + // allocate buffer int bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - + bufferlist bl; bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - // place on disk - if (cn->cnode_loc.length < blocks) { - // relocate cnode! + // (always) relocate cnode! + if (1) { if (cn->cnode_loc.length) allocator.release(cn->cnode_loc); - + allocator.allocate(cn->cnode_loc, blocks, 0); collection_tab->remove( cn->coll_id ); collection_tab->insert( cn->coll_id, cn->cnode_loc ); @@ -449,7 +552,7 @@ void Ebofs::write_cnode(Cnode *cn) } // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl ); + dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, c ); } void Ebofs::remove_cnode(Cnode *cn) @@ -472,11 +575,73 @@ void Ebofs::put_cnode(Cnode *cn) cn->put(); } +void Ebofs::dirty_cnode(Cnode *cn) +{ + if (!cn->is_dirty()) { + cn->mark_dirty(); + dirty_cnodes.insert(cn); + } +} +class C_E_InodeFlush : public Context { + Ebofs *ebofs; +public: + C_E_InodeFlush(Ebofs *e) : ebofs(e) {} + void finish(int r) { + ebofs->flush_inode_finish(); + } +}; +void Ebofs::flush_inode_finish() +{ + ebofs_lock.Lock(); + inodes_flushing--; + if (inodes_flushing == 0) + inode_commit_cond.Signal(); + ebofs_lock.Unlock(); +} + +void Ebofs::commit_inodes_start() +{ + dout(10) << "commit_inodes_start" << endl; + + assert(inodes_flushing == 0); + + // onodes + for (set::iterator i = dirty_onodes.begin(); + i != dirty_onodes.end(); + i++) { + Onode *on = *i; + inodes_flushing++; + write_onode(on, new C_E_InodeFlush(this)); + on->mark_clean(); + } + + // cnodes + for (set::iterator i = dirty_cnodes.begin(); + i != dirty_cnodes.end(); + i++) { + Cnode *cn = *i; + inodes_flushing++; + write_cnode(cn, new C_E_InodeFlush(this)); + cn->mark_clean(); + } + + dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl; +} + +void Ebofs::commit_inodes_wait() +{ + // caller must hold ebofs_lock + while (inodes_flushing > 0) { + dout(10) << "commit_inodes_wait for " << inodes_flushing << " onodes+cnodes to flush" << endl; + inode_commit_cond.Wait(ebofs_lock); + } + dout(10) << "commit_inodes_wait all flushed" << endl; +} @@ -486,9 +651,10 @@ void Ebofs::put_cnode(Cnode *cn) // *** buffer cache *** +// ... should already hold lock ... void Ebofs::trim_buffer_cache() { - bc.lock.Lock(); + //ebofs_lock.Lock(); // flush any dirty items? while (bc.lru_dirty.lru_get_size() > bc.lru_dirty.lru_get_max()) { @@ -528,10 +694,16 @@ void Ebofs::trim_buffer_cache() << bc.lru_rest.lru_get_size() << " rest + " << bc.lru_dirty.lru_get_size() << " dirty " << endl; - bc.lock.Unlock(); + //ebofs_lock.Unlock(); } + +void Ebofs::commit_bc_wait(version_t epoch) +{ + dout(1) << "commit_bc_wait" << endl; +} + void Ebofs::flush_all() { // FIXME what about partial heads? @@ -627,6 +799,7 @@ void Ebofs::bh_write(Onode *on, BufferHead *bh) assert(bh->is_dirty()); bc.mark_tx(bh); + bh->tx_epoch = super_epoch; // note the epoch! // get extents vector ex; @@ -651,6 +824,101 @@ void Ebofs::bh_write(Onode *on, BufferHead *bh) } +/* + * allocate a write to blocks on disk. + * take care to not overwrite any "safe" data blocks. + * break up bufferheads in bh_hits that span realloc boundaries. + * final bufferhead set stored in final! + */ +void Ebofs::alloc_write(Onode *on, + block_t start, block_t len, + map& hits) +{ + // first decide what pages to (re)allocate + interval_set alloc; + on->map_alloc_regions(start, len, alloc); + + dout(10) << "alloc_write need to alloc " << alloc << endl; + + // merge alloc into onode uncommitted map + cout << "union of " << on->uncommitted << " and " << alloc << endl; + on->uncommitted.union_of(alloc); + + dout(10) << "alloc_write onode uncommitted now " << on->uncommitted << endl; + + // allocate the space + for (map::iterator i = alloc.m.begin(); + i != alloc.m.end(); + i++) { + // get old region + vector old; + on->map_extents(i->first, i->second, old); + for (unsigned o=0; osecond; + block_t cur = i->first; + while (left > 0) { + Extent ex; + allocator.allocate(ex, left, 0); + dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl; + on->set_extent(cur, ex); // map object to new region + left -= ex.length; + cur += ex.length; + } + } + + // now break up the bh's as necessary + block_t cur = start; + block_t left = len; + + map::iterator bhp = hits.begin(); + map::iterator ap = alloc.m.begin(); + + block_t aoff = 0; + while (left > 0) { + assert(cur == bhp->first); + BufferHead *bh = bhp->second; + assert(cur == bh->start()); + assert(left >= bh->length()); + + assert(ap->first+aoff == bh->start()); + if (ap->second-aoff == bh->length()) { + // perfect. + cur += bh->length(); + left -= bh->length(); + ap++; + aoff = 0; + bhp++; + continue; + } + + if (bh->length() < ap->second-aoff) { + // bh is within alloc range. + cur += bh->length(); + left -= bh->length(); + aoff += bh->length(); + bhp++; + continue; + } + + // bh spans alloc boundary, split it! + assert(bh->length() > ap->second - aoff); + BufferHead *n = bc.split(bh, bh->start() + ap->second-aoff); + hits[n->start()] = n; // add new guy to hit map + + // bh is now shortened... + cur += bh->length(); + left -= bh->length(); + assert(ap->second == aoff + bh->length()); + aoff = 0; + ap++; + continue; + } +} + + void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) { @@ -673,7 +941,7 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) dout(10) << "apply_write extending object size " << on->object_size << " -> " << off+len << endl; on->object_size = off+len; - on->mark_dirty(); + dirty_onode(on); } if (zleft) dout(10) << "apply_write zeroing first " << zleft << " bytes" << endl; @@ -681,11 +949,13 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; block_t blen = blast-bstart+1; - bc.lock.Lock(); - + // map b range onto buffer_heads map hits; oc->map_write(bstart, blen, hits); - + + // allocate write on disk. break buffer_heads across realloc/no realloc boundaries + alloc_write(on, bstart, blen, hits); + // get current versions version_t lowv, highv; oc->scan_versions(bstart, blen, lowv, highv); @@ -702,10 +972,20 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) bh->set_version(highv+1); // cancel old io? - if (bh->ioh) { - dout(10) << "apply_write canceling old io on " << *bh << endl; - bc.dev.cancel_io( bh->ioh ); - bh->ioh = 0; + if (bh->is_tx()) { + if (bh->tx_epoch == super_epoch) { + // try to cancel the old io (just bc it's a waste) + dout(10) << "apply_write canceling old io on " << *bh << endl; + bc.dev.cancel_io(bh->ioh); + bh->ioh = 0; + } else { + // this tx is from prior epoch! shadow+copy the buffer before we modify it. + bh->shadow_data.claim(bh->data); + bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! + bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, bh->shadow_data); + bh->shadow_ioh = bh->ioh; + bh->ioh = 0; + } } // partial at head or tail? @@ -753,6 +1033,7 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) bc.dev.cancel_io( bh->ioh ); bh->ioh = 0; } + bh_write(on, bh); } else if (bh->is_rx()) { dout(10) << "apply_write rx -> partial " << *bh << endl; @@ -793,8 +1074,10 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) left -= len_in_bh-z; opos += len_in_bh-z; - if (!bh->is_dirty()) + if (!bh->is_dirty()) bc.mark_dirty(bh); + + bh_write(on, bh); } continue; } @@ -834,13 +1117,14 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) // mark dirty if (!bh->is_dirty()) bc.mark_dirty(bh); + + bh_write(on, bh); } + assert(zleft == 0); assert(left == 0); assert(opos == off+len); //assert(blpos == bl.length()); - - bc.lock.Unlock(); } @@ -967,31 +1251,34 @@ int Ebofs::read(object_t oid, size_t len, off_t off, bufferlist& bl) { + ebofs_lock.Lock(); Onode *on = get_onode(oid); - if (!on) + if (!on) { + ebofs_lock.Unlock(); return -1; // object dne? + } // read data into bl. block as necessary. Cond cond; - bc.lock.Lock(); while (1) { // check size bound - if (off >= on->object_size) { - put_onode(on); + if (off >= on->object_size) break; - } - size_t will_read = MIN( off+len, on->object_size ) - off; + + size_t will_read = MIN(off+len, on->object_size) - off; if (attempt_read(on, will_read, off, bl, &cond)) break; // yay // wait - cond.Wait(bc.lock); + cond.Wait(ebofs_lock); } - bc.lock.Unlock(); - + + put_onode(on); + + ebofs_lock.Unlock(); return 0; } @@ -1000,71 +1287,70 @@ int Ebofs::write(object_t oid, size_t len, off_t off, bufferlist& bl, bool fsync) { - // FIXME - return write(oid, len, off, bl, (Context*)0); + // wait? + if (fsync) { + // wait for flush. + + // FIXME. wait on a Cond or whatever! be careful about ebofs_lock. + + return write(oid, len, off, bl, (Context*)0); + } else { + // don't wait. + return write(oid, len, off, bl, (Context*)0); + } } int Ebofs::write(object_t oid, size_t len, off_t off, bufferlist& bl, Context *onflush) { + ebofs_lock.Lock(); + assert(len > 0); // get inode Onode *on = get_onode(oid); if (!on) on = new_onode(oid); // new inode! - - // allocate more space? - block_t bnum = (len+off-1) / EBOFS_BLOCK_SIZE + 1; - if (bnum > on->object_blocks) { - block_t need = bnum - on->object_blocks; - block_t near = 0; - if (on->extents.size()) - near = on->extents[on->extents.size()-1].end(); - - while (need > 0) { - Extent ex; - allocator.allocate(ex, need, near); - dout(10) << "apply_write allocated " << ex << " near " << near << endl; - on->extents.push_back(ex); - on->object_blocks += ex.length; - need -= ex.length; - near = ex.end(); - } - } - // apply to buffer cache + // apply write to buffer cache apply_write(on, len, off, bl); - - - // attr changes + // apply attribute changes // *** + // prepare (eventual) journal entry. + // set up onfinish waiter if (onflush) { - } // done put_onode(on); + ebofs_lock.Unlock(); return 0; } int Ebofs::remove(object_t oid) { + ebofs_lock.Lock(); + // get inode Onode *on = get_onode(oid); - if (!on) return -1; + if (!on) { + ebofs_lock.Unlock(); + return -1; + } // FIXME locking, buffer, flushing etc. assert(0); remove_onode(on); + + ebofs_lock.Unlock(); return 0; } @@ -1077,23 +1363,28 @@ int Ebofs::truncate(object_t oid, off_t size) bool Ebofs::exists(object_t oid) { + ebofs_lock.Lock(); Onode *on = get_onode(oid); - if (!on) - return false; - put_onode(on); - return true; + if (on) put_onode(on); + ebofs_lock.Unlock(); + return on ? true:false; } int Ebofs::stat(object_t oid, struct stat *st) { + ebofs_lock.Lock(); + Onode *on = get_onode(oid); - if (!on) + if (!on) { + ebofs_lock.Unlock(); return -1; + } // ?? st->st_size = on->object_size; put_onode(on); + ebofs_lock.Unlock(); return 0; } @@ -1107,7 +1398,7 @@ int Ebofs::setattr(object_t oid, const char *name, void *value, size_t size) string n(name); AttrVal val((char*)value, size); on->attr[n] = val; - on->mark_dirty(); + dirty_onode(on); put_onode(on); return 0; @@ -1122,7 +1413,7 @@ int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) if (on->attr.count(n) == 0) return -1; memcpy(value, on->attr[n].data, MIN( on->attr[n].len, (int)size )); - on->mark_dirty(); + dirty_onode(on); put_onode(on); return 0; } @@ -1135,7 +1426,7 @@ int Ebofs::rmattr(object_t oid, const char *name) string n(name); on->attr.erase(n); - on->mark_dirty(); + dirty_onode(on); put_onode(on); return 0; } @@ -1257,7 +1548,7 @@ int Ebofs::collection_setattr(coll_t cid, const char *name, void *value, size_t string n(name); AttrVal val((char*)value, size); cn->attr[n] = val; - cn->mark_dirty(); + dirty_cnode(cn); put_cnode(cn); return 0; @@ -1272,7 +1563,6 @@ int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t if (cn->attr.count(n) == 0) return -1; memcpy(value, cn->attr[n].data, MIN( cn->attr[n].len, (int)size )); - cn->mark_dirty(); put_cnode(cn); return 0; } @@ -1285,7 +1575,7 @@ int Ebofs::collection_rmattr(coll_t cid, const char *name) string n(name); cn->attr.erase(n); - cn->mark_dirty(); + dirty_cnode(cn); put_cnode(cn); return 0; } diff --git a/ceph/ebofs/Ebofs.h b/ceph/ebofs/Ebofs.h index 8702da0e851..4830a8ad8c9 100644 --- a/ceph/ebofs/Ebofs.h +++ b/ceph/ebofs/Ebofs.h @@ -26,16 +26,35 @@ inline ostream& operator<<(ostream& out, idpair_t oc) { } +const int EBOFS_COMMIT_INTERVAL = 10; // whatever + + class Ebofs : public ObjectStore { protected: Mutex ebofs_lock; // a beautiful global lock // ** super ** - bool mounted; BlockDevice &dev; - version_t super_version; + bool mounted, unmounting; + bool readonly; + version_t super_epoch; + + void prepare_super(version_t epoch, bufferptr& bp); + void write_super(version_t epoch, bufferptr& bp); + + Cond commit_cond; // to wake up the commit thread + int commit_thread_entry(); + + class CommitThread : public Thread { + Ebofs *ebofs; + public: + CommitThread(Ebofs *e) : ebofs(e) {} + void *entry() { + ebofs->commit_thread_entry(); + return 0; + } + } commit_thread; - int write_super(); // ** allocator ** block_t free_blocks; @@ -48,38 +67,50 @@ class Ebofs : public ObjectStore { // ** tables and sets ** // nodes - NodePool table_nodepool; // for primary tables. + NodePool nodepool; // for all tables... // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; + Table *object_tab; + Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; // collections - Table *collection_tab; + Table *collection_tab; Table *oc_tab; Table *co_tab; + void close_tables(); + // ** onode cache ** hash_map onode_map; // onode cache LRU onode_lru; + set dirty_onodes; Onode* new_onode(object_t oid); // make new onode. ref++. Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void write_onode(Onode *on); + void write_onode(Onode *on, Context *c); void remove_onode(Onode *on); void put_onode(Onode* o); // put it back down. ref--. + void dirty_onode(Onode* o); // ** cnodes ** hash_map cnode_map; LRU cnode_lru; + set dirty_cnodes; + int inodes_flushing; + Cond inode_commit_cond; Cnode* new_cnode(coll_t cid); Cnode* get_cnode(coll_t cid); - void write_cnode(Cnode *cn); + void write_cnode(Cnode *cn, Context *c); void remove_cnode(Cnode *cn); void put_cnode(Cnode *cn); + void dirty_cnode(Cnode *cn); + void flush_inode_finish(); + void commit_inodes_start(); + void commit_inodes_wait(); + friend class C_E_InodeFlush; public: void trim_onode_cache(); @@ -89,12 +120,17 @@ class Ebofs : public ObjectStore { BufferCache bc; pthread_t flushd_thread_id; + void commit_bc_wait(version_t epoch); + public: void trim_buffer_cache(); void flush_all(); protected: - void zero(Onode *on, size_t len, off_t off, off_t write_thru); + //void zero(Onode *on, size_t len, off_t off, off_t write_thru); + void alloc_write(Onode *on, + block_t start, block_t len, + map& hits); void apply_write(Onode *on, size_t len, off_t off, bufferlist& bl); bool attempt_read(Onode *on, size_t len, off_t off, bufferlist& bl, Cond *will_wait_on); @@ -111,11 +147,14 @@ class Ebofs : public ObjectStore { public: Ebofs(BlockDevice& d) : - dev(d), + dev(d), + mounted(false), unmounting(false), readonly(false), super_epoch(0), + commit_thread(this), free_blocks(0), allocator(this), bufferpool(EBOFS_BLOCK_SIZE), object_tab(0), collection_tab(0), oc_tab(0), co_tab(0), - bc(dev, bufferpool) { + inodes_flushing(0), + bc(dev, bufferpool, ebofs_lock) { for (int i=0; i attr; vector extents; + interval_set uncommitted; + ObjectCache *oc; bool dirty; @@ -94,6 +99,81 @@ public: // allocation + void _append_extent(Extent& ex) { + if (extents.size() && + extents[extents.size()-1].end() == ex.start) + extents[extents.size()-1].length += ex.length; + else + extents.push_back(ex); + } + void set_extent(block_t offset, Extent ex) { + assert(offset <= object_blocks); + + // at the end? + if (offset == object_blocks) { + _append_extent(ex); + object_blocks += ex.length; + return; + } + + // nope. ok, rebuild the extent list. + vector old; + old.swap(extents); + assert(extents.empty()); + + unsigned oldex = 0; + block_t oldoff = 0; + block_t cur = 0; + + // copy up to offset + while (cur < offset) { + Extent t; + t.start = old[oldex].start+oldoff; + t.length = MIN(offset-cur, old[oldex].length-oldoff); + _append_extent(t); + + oldoff += t.length; + if (oldoff == old[oldex].length) { + oldex++; + oldoff = 0; + } + } + + // add our new extent + _append_extent(ex); + if (offset + ex.length > object_blocks) + object_blocks = offset + ex.length; + + // skip past it in the old stuff + block_t sleft = ex.length; + while (sleft > 0) { + block_t skip = MIN(ex.length, old[oldex].length-oldoff); + sleft -= skip; + oldoff += skip; + if (oldoff == old[oldex].length) { + oldex++; + oldoff = 0; + if (oldex == old.size()) break; + } + } + + // copy anything left? + if (oldex < old.size()) { + if (oldoff) { + Extent t; + t.start = old[oldex].start+oldoff; + t.length = old[oldex].length-oldoff; + _append_extent(t); + } else { + _append_extent(old[oldex++]); + } + } + } + + + /* map_extents(start, len, ls) + * map teh given page range into extents on disk. + */ int map_extents(block_t start, block_t len, vector& ls) { block_t cur = 0; for (unsigned i=0; i& alloc) { + interval_set already_uncom; + + alloc.insert(start, len); // start with whole range + already_uncom.intersection_of(alloc, uncommitted); + alloc.subtract(already_uncom); // take out the bits that aren't yet committed } - int setxattr(char *name, void *value, int size) { - return 0; + + + + /* + // (un)committed ranges + void dirty_range(block_t start, block_t len) { + // frame affected area (used for simplify later) + block_t first = start; + block_t last = start+len; + + // put in uncommitted range + map::iterator p = uncommitted.lower_bound(start); + if (p != uncommitted.begin() && + (p->first > start || p == uncommitted.end())) { + p--; + first = p->first; + if (p->first + p->second <= start) + p++; + } + + for (; len>0; p++) { + if (p == uncommitted.end()) { + uncommitted[start] = len; + break; + } + + if (p->first <= start) { + block_t skip = start - p->first; + block_t overlap = MIN( p->second - skip, len ); + start += overlap; + len -= overlap; + } else if (p->first > start) { + block_t add = MIN(len, p->first - start); + uncommitted[start] = add; + start += add; + len -= add; + } + } + + // simplify uncommitted + map::iterator p = uncommitted.lower_bound(first); + block_t prevstart = p->first; + block_t prevend = p->first + p->second; + p++; + while (p != uncommitted.end()) { + if (prevend == p->first) { + uncommitted[prevstart] += p->second; + prevend += p->second; + block_t t = p->first; + p++; + uncommitted.erase(t); + } else { + prevstart = p->first; + prevend = p->first + p->second; + if (prevend >= last) break; // we've gone past our updates + p++; + } + } } - int removexattr(char *name) { - return 0; + void mark_committed() { + uncommitted.clear(); } + bool is_uncommitted(block_t b) { + + } + */ + // pack/unpack int get_attr_bytes() { diff --git a/ceph/ebofs/Table.h b/ceph/ebofs/Table.h index 7dbb62c82d6..d0812841e41 100644 --- a/ceph/ebofs/Table.h +++ b/ceph/ebofs/Table.h @@ -473,8 +473,7 @@ class Table : public _Table { // create a root node (leaf!) assert(root == -1); assert(depth == 0); - Nodeptr newroot( pool.new_node() ); - newroot.set_type(Node::TYPE_LEAF); + Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); root = newroot.get_id(); depth++; } @@ -529,9 +528,8 @@ class Table : public _Table { cursor.dirty(); // split - Nodeptr newnode( pool.new_node() ); Nodeptr leftnode = cursor.open[cursor.level]; - newnode.set_type( leftnode.node->get_type() ); + Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); leftnode.split( newnode ); /* insert our item */ @@ -558,10 +556,9 @@ class Table : public _Table { if (cursor.level == 0) { /* split root. */ dbtout << "that split was the root " << root << endl; - Nodeptr newroot( pool.new_node() ); + Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); /* new root node */ - newroot.set_type(Node::TYPE_INDEX); newroot.set_size(2); newroot.index_item(0).key = leftnode.key(0); newroot.index_item(0).node = root; diff --git a/ceph/ebofs/mkfs.ebofs.cc b/ceph/ebofs/mkfs.ebofs.cc index 0946b97acdb..733e202f2ca 100644 --- a/ceph/ebofs/mkfs.ebofs.cc +++ b/ceph/ebofs/mkfs.ebofs.cc @@ -18,8 +18,12 @@ int main(int argc, char **argv) } // mkfs + Ebofs mfs(dev); + mfs.mkfs(); + + // test-o-rama! Ebofs fs(dev); - fs.mkfs(); + fs.mount(); if (0) { // test bufferlist bl; diff --git a/ceph/ebofs/nodes.h b/ceph/ebofs/nodes.h index b12f50f65f8..d9c54e5fb41 100644 --- a/ceph/ebofs/nodes.h +++ b/ceph/ebofs/nodes.h @@ -8,32 +8,34 @@ #include "AlignedBufferPool.h" -/* node status on disk, in memory - * - * DISK MEMORY ON LISTS EVENT - * - * claim cycle: - * free free free - - * free dirty dirty mark_dirty() - * free dirty committing start commit - * inuse dirty committing (write happens) - * inuse clean - finish commit - * - * release cycle: - * inuse clean - - - * inuse free limbo release - * inuse free committing start_write - * free free committing (write happens) - * free free free finish_write - */ +/* + disk wire memory + + free free -> free can alloc + free used -> dirty can modify + + free used used -> tx + free used free -> limbo + + used used -> clean + used free -> limbo + + + // meaningless + used free free -> free can alloc + used free used __DNE__ + + +*/ class Node { public: - static const int STATUS_FREE = 0; - static const int STATUS_DIRTY = 1; - static const int STATUS_CLEAN = 2; + // bit fields + static const int STATE_CLEAN = 1; + static const int STATE_DIRTY = 2; + static const int STATE_TX = 3; static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); @@ -42,55 +44,91 @@ class Node { protected: nodeid_t id; + int state; // use bit fields above! + bufferptr bptr; - int *nrecs; - int *status; + bufferptr shadow_bptr; + + // in disk buffer int *type; + int *nrecs; public: - Node(nodeid_t i) : id(i) { - bptr = new buffer(EBOFS_NODE_BYTES); + Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) { nrecs = (int*)(bptr.c_str()); - status = (int*)(bptr.c_str() + sizeof(*nrecs)); - type = (int*)(bptr.c_str() + sizeof(*status) + sizeof(*nrecs)); - clear(); - } - Node(nodeid_t i, bufferptr& b) : id(i), bptr(b) { - nrecs = (int*)(bptr.c_str()); - status = (int*)(bptr.c_str() + sizeof(*nrecs)); - type = (int*)(bptr.c_str() + sizeof(*status) + sizeof(*nrecs)); + type = (int*)(bptr.c_str() + sizeof(*nrecs)); } - void clear() { - *nrecs = 0; - *status = STATUS_FREE; - *type = 0; - } + // id nodeid_t get_id() const { return id; } void set_id(nodeid_t n) { id = n; } + // buffer bufferptr& get_buffer() { return bptr; } + char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } + + // size int size() { return *nrecs; } void set_size(int s) { *nrecs = s; } - char *item_ptr() { return bptr.c_str() + sizeof(*status) + sizeof(*nrecs) + sizeof(*type); } - - int& get_status() { return *status; } - bool is_dirty() const { return *status == STATUS_DIRTY; } - void set_status(int s) { *status = s; } - + // type int& get_type() { return *type; } void set_type(int t) { *type = t; } bool is_index() { return *type == TYPE_INDEX; } bool is_leaf() { return *type == TYPE_LEAF; } + + + // state + bool is_dirty() { return state == STATE_DIRTY; } + bool is_tx() { return state == STATE_TX; } + bool is_clean() { return state == STATE_CLEAN; } + + void set_state(int s) { state = s; } + + void make_shadow(AlignedBufferPool& bufferpool) { + assert(is_tx()); + + shadow_bptr = bptr; + + // new buffer + bptr = bufferpool.alloc(EBOFS_NODE_BYTES); + nrecs = (int*)(bptr.c_str()); + type = (int*)(bptr.c_str() + sizeof(*nrecs)); + + // copy contents! + memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES); + } + }; -class NodeRegion { + + +class NodePool { + protected: + AlignedBufferPool bufferpool; // our own memory allocator for node buffers + + map node_map; // open node map + public: + vector region_loc; // region locations + Extent usemap_even; + Extent usemap_odd; + + protected: + // on-disk block states + set free; + set dirty; + set tx; + set clean; // aka used + set limbo; + + Mutex lock; + Cond commit_cond; + int flushing; static int make_nodeid(int region, int offset) { return (region << 24) | offset; @@ -101,219 +139,276 @@ class NodeRegion { static int nodeid_offset(nodeid_t nid) { return nid & (0xffffffffUL >> 24); } - - protected: - int region_id; - int num_nodes; + public: - Extent location; - - // free -> dirty -> committing -> clean - // dirty -> free - // or !dirty -> limbo -> free - set free; - set dirty; - set committing; - set limbo; - - public: - NodeRegion(int id, Extent& loc) : region_id(id) { - location = loc; - num_nodes = location.length / EBOFS_NODE_BLOCKS; - } - - int get_region_id() const { return region_id; } - - void init_all_free() { - for (unsigned i=0; i limbo : so they get written out as such - for (set::iterator i = free.begin(); - i != free.end(); - i++) - limbo.insert(*i); - free.clear(); - } - - int size() const { - return num_nodes; - //return (location.length / EBOFS_NODE_BLOCKS); // FIXME THIS IS WRONG - } - int num_free() const { + int num_free() { return free.size(); } - // new/open node - nodeid_t new_nodeid() { - assert(num_free()); - - nodeid_t nid = *(free.begin()); - free.erase(nid); - dirty.insert(nid); - - return nid; - } - - void release(nodeid_t nid) { - if (dirty.count(nid)) { - dirty.erase(nid); - free.insert(nid); - } - else { - if (committing.count(nid)) - committing.erase(nid); - limbo.insert(nid); + // the caller had better adjust usemap locations... + void add_region(Extent ex) { + int region = region_loc.size(); + region_loc.push_back(ex); + for (unsigned o = 0; o < ex.length; o++) { + free.insert( make_nodeid(region, o) ); } } -}; - - - -class NodePool { - protected: - int num_regions; - map node_regions; // regions - map node_map; // open node map - AlignedBufferPool bufferpool; // memory pool - - public: - NodePool() : num_regions(0), bufferpool(EBOFS_NODE_BYTES) {} - ~NodePool() { - // nodes - set left; - for (map::iterator i = node_map.begin(); - i != node_map.end(); - i++) - left.insert(i->second); - for (set::iterator i = left.begin(); - i != left.end(); - i++) - release( *i ); - assert(node_map.empty()); - - // regions - for (map::iterator i = node_regions.begin(); - i != node_regions.end(); - i++) - delete i->second; - node_regions.clear(); - } - - void add_region(NodeRegion *r) { - node_regions[r->get_region_id()] = r; - num_regions++; - } - int get_num_regions() { - return num_regions; - } - Extent& get_region_loc(int r) { - return node_regions[r]->location; - } - - - int num_free() { - int f = 0; - for (map::iterator i = node_regions.begin(); - i != node_regions.end(); - i++) - f += i->second->num_free(); - return f; - } - - // *** - int read(BlockDevice& dev, struct ebofs_nodepool *np) { + int init(struct ebofs_nodepool *np) { + // regions for (int i=0; inum_regions; i++) { - dout(3) << " region " << i << " at " << np->region_loc[i] << endl; - NodeRegion *r = new NodeRegion(i, np->region_loc[i]); - add_region(r); - - for (block_t boff = 0; boff < r->location.length; boff += EBOFS_NODE_BLOCKS) { - nodeid_t nid = NodeRegion::make_nodeid(r->get_region_id(), boff); + dout(3) << "init region " << i << " at " << np->region_loc[i] << endl; + region_loc.push_back( np->region_loc[i] ); + } + + // usemap + usemap_even = np->node_usemap_even; + usemap_odd = np->node_usemap_odd; + dout(3) << "init even map at " << usemap_even << endl; + dout(3) << "init odd map at " << usemap_odd << endl; + + return 0; + } + + + // *** blocking i/o routines *** + + int read_usemap(BlockDevice& dev, version_t epoch) { + // read map + Extent loc; + if (epoch & 1) + loc = usemap_odd; + else + loc = usemap_even; + + bufferptr bp = bufferpool.alloc(EBOFS_BLOCK_SIZE*loc.length); + dev.read(loc.start, loc.length, bp); + + // parse + unsigned region = 0; // current region + unsigned roff = 0; // offset in region + for (unsigned byte = 0; byte> 1; // move one bit right. + roff++; + if (roff == region_loc[region].length) { + // next region! + roff = 0; + region++; + break; + } + } + if (region == region_loc.size()) break; + } + return 0; + } + + int read_clean_nodes(BlockDevice& dev) { + /* + this relies on the clean set begin defined so that we know which nodes + to read. so it only really works when called from mount()! + */ + for (unsigned r=0; rlocation.start + (block_t)boff, EBOFS_NODE_BLOCKS, + dev.read(region_loc[r].start + (block_t)boff, EBOFS_NODE_BLOCKS, bp); - Node *n = new Node(nid, bp); - if (n->get_status() == Node::STATUS_FREE) { - dout(5) << " node " << nid << " free" << endl; - r->free.insert(nid); - delete n; - } else { - dout(5) << " node " << nid << " in use" << endl; - node_map[nid] = n; - } + Node *n = new Node(nid, bp, Node::STATE_CLEAN); + node_map[nid] = n; } } return 0; } - void init_all_free() { - for (map::iterator i = node_regions.begin(); - i != node_regions.end(); - i++) { - i->second->init_all_free(); + + + + // **** non-blocking i/o **** + + private: + class C_NP_FlushUsemap : public Context { + NodePool *pool; + public: + C_NP_FlushUsemap(NodePool *p) : + pool(p) {} + void finish(int r) { + pool->flushed_usemap(); } + }; + + void flushed_usemap() { + lock.Lock(); + flushing--; + if (flushing == 0) + commit_cond.Signal(); + lock.Unlock(); } - void induce_full_flush() { - for (map::iterator i = node_regions.begin(); - i != node_regions.end(); - i++) { - i->second->induce_full_flush(); - } - } + public: + int write_usemap(BlockDevice& dev, version_t version) { + // alloc + Extent loc; + if (version & 1) + loc = usemap_odd; + else + loc = usemap_even; + + bufferptr bp = bufferpool.alloc(EBOFS_BLOCK_SIZE*loc.length); - void flush(BlockDevice& dev) { - // flush dirty items, change them to limbo status - for (map::iterator i = node_regions.begin(); - i != node_regions.end(); - i++) { - NodeRegion *r = i->second; - - // dirty -> clean - r->committing = r->dirty; - r->dirty.clear(); + // fill in + unsigned region = 0; // current region + unsigned roff = 0; // offset in region + for (unsigned byte = 0; byte::iterator i = r->committing.begin(); - i != r->committing.end(); - i++) { - Node *n = get_node(*i); - assert(n); // it's dirty, we better have it - n->set_status(Node::STATUS_CLEAN); - block_t off = NodeRegion::nodeid_offset(*i); - dev.write(r->location.start + off, EBOFS_NODE_BLOCKS, n->get_buffer()); + roff++; + mask = mask >> 1; + if (roff == region_loc[region].length) { + // next region! + roff = 0; + region++; + break; + } } - r->committing.clear(); - // limbo -> free - r->committing = r->limbo; - r->limbo.clear(); - - bufferptr freebuffer = bufferpool.alloc(EBOFS_NODE_BYTES); - Node freenode(1, freebuffer); - freenode.set_status(Node::STATUS_FREE); - for (set::iterator i = r->committing.begin(); - i != r->committing.end(); - i++) { - freenode.set_id( *i ); - block_t off = NodeRegion::nodeid_offset(*i); - dev.write(r->location.start + off, EBOFS_NODE_BLOCKS, freenode.get_buffer()); - } - - for (set::iterator i = r->committing.begin(); - i != r->committing.end(); - i++) - r->free.insert(*i); - r->committing.clear(); + *(unsigned char*)(bp.c_str() + byte) = x; + if (region == region_loc.size()) break; } + + + // write + bufferlist bl; + bl.append(bp); + dev.write(loc.start, loc.length, bl, + new C_NP_FlushUsemap(this)); + return 0; } + + + + // *** node commit *** + private: + bool is_committing() { + return (flushing > 0); + } + + class C_NP_FlushNode : public Context { + NodePool *pool; + nodeid_t nid; + public: + C_NP_FlushNode(NodePool *p, nodeid_t n) : + pool(p), nid(n) {} + void finish(int r) { + pool->flushed_node(nid); + } + }; + + void flushed_node(nodeid_t nid) { + lock.Lock(); + assert(tx.count(nid)); + + if (tx.count(nid)) { + tx.erase(nid); + clean.insert(nid); + } + else { + assert(limbo.count(nid)); + } + + flushing--; + if (flushing == 0) + commit_cond.Signal(); + lock.Unlock(); + } + + public: + void commit_start(BlockDevice& dev, version_t version) { + lock.Lock(); + assert(!is_committing()); + + // write map + flushing++; + write_usemap(dev,version & 1); + + // dirty -> tx (write to disk) + for (set::iterator i = dirty.begin(); + i != dirty.end(); + i++) { + Node *n = get_node(*i); + assert(n); + assert(n->is_dirty()); + n->set_state(Node::STATE_TX); + + unsigned region = nodeid_region(*i); + block_t off = nodeid_offset(*i); + + bufferlist bl; + bl.append(n->get_buffer()); + dev.write(region_loc[region].start + off, EBOFS_NODE_BLOCKS, + bl, + new C_NP_FlushNode(this, *i)); + flushing++; + + tx.insert(*i); + } + dirty.clear(); + + // limbo -> free + for (set::iterator i = limbo.begin(); + i != limbo.end(); + i++) { + free.insert(*i); + } + limbo.clear(); + + lock.Unlock(); + } + + void commit_wait() { + lock.Lock(); + while (is_committing()) { + commit_cond.Wait(lock); + } + lock.Unlock(); + } + + + + + + @@ -326,7 +421,7 @@ class NodePool { } // unopened node - /* + /* not implemented yet!! Node* open_node(nodeid_t nid) { Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); dbtout << "pool.open_node " << n->get_id() << endl; @@ -335,47 +430,86 @@ class NodePool { } */ - // new node - nodeid_t new_nodeid() { - for (map::iterator i = node_regions.begin(); - i != node_regions.end(); - i++) { - if (i->second->num_free()) - return i->second->new_nodeid(); - } - assert(0); // full! - return -1; + // allocate id/block on disk. always free -> dirty. + nodeid_t alloc_id() { + // pick node id + assert(!free.empty()); + nodeid_t nid = *(free.begin()); + free.erase(nid); + dirty.insert(nid); + return nid; } - Node* new_node() { + + // new node + Node* new_node(int type) { + nodeid_t nid = alloc_id(); + dbtout << "pool.new_node " << nid << endl; + + // alloc node bufferptr bp = bufferpool.alloc(EBOFS_NODE_BYTES); - Node *n = new Node(new_nodeid(), bp); - n->clear(); - dbtout << "pool.new_node " << n->get_id() << endl; - assert(node_map.count(n->get_id()) == 0); - node_map[n->get_id()] = n; + Node *n = new Node(nid, bp, Node::STATE_DIRTY); + n->set_type(type); + n->set_size(0); + + assert(node_map.count(nid) == 0); + node_map[nid] = n; return n; } - void release_nodeid(nodeid_t nid) { - dbtout << "pool.release_nodeid on " << nid << endl; - assert(node_map.count(nid) == 0); - node_regions[ NodeRegion::nodeid_region(nid) ]->release(nid); - return; - } void release(Node *n) { - dbtout << "pool.release on " << n->get_id() << endl; - node_map.erase(n->get_id()); - release_nodeid(n->get_id()); + const nodeid_t nid = n->get_id(); + dbtout << "pool.release on " << nid << endl; + node_map.erase(nid); + + if (n->is_dirty()) { + dirty.erase(nid); + free.insert(nid); + } else if (n->is_clean()) { + clean.erase(nid); + limbo.insert(nid); + } else + assert(0); + delete n; } + void release_all() { + set left; + for (map::iterator i = node_map.begin(); + i != node_map.end(); + i++) + left.insert(i->second); + for (set::iterator i = left.begin(); + i != left.end(); + i++) + release( *i ); + assert(node_map.empty()); + } + void dirty_node(Node *n) { - assert(!n->is_dirty()); - n->set_status(Node::STATUS_DIRTY); - nodeid_t newid = new_nodeid(); - dbtout << "pool.dirty_node on " << n->get_id() << " now " << newid << endl; - node_map.erase(n->get_id()); - release_nodeid(n->get_id()); + // get new node id? + nodeid_t oldid = n->get_id(); + nodeid_t newid = alloc_id(); + dbtout << "pool.dirty_node on " << oldid << " now " << newid << endl; + + // release old block + if (n->is_clean()) { + assert(clean.count(oldid)); + clean.erase(oldid); + } else { + assert(n->is_tx()); + assert(tx.count(oldid)); + tx.erase(oldid); + + // move/copy current -> shadow buffer as necessary + n->make_shadow(bufferpool); + } + limbo.insert(oldid); + node_map.erase(oldid); + + n->set_state(Node::STATE_DIRTY); + + // move to new one! n->set_id(newid); node_map[newid] = n; } diff --git a/ceph/ebofs/types.h b/ceph/ebofs/types.h index 0931c3ca613..f5c34a873bc 100644 --- a/ceph/ebofs/types.h +++ b/ceph/ebofs/types.h @@ -75,6 +75,9 @@ static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! struct ebofs_nodepool { + Extent node_usemap_even; // for even sb versions + Extent node_usemap_odd; // for odd sb versions + int num_regions; Extent region_loc[EBOFS_MAX_NODE_REGIONS]; }; @@ -127,24 +130,23 @@ static const int EBOFS_NUM_FREE_BUCKETS = 16; /* see alloc.h for bucket constr struct ebofs_super { unsigned s_magic; - unsigned version; // version of this superblock. + unsigned epoch; // version of this superblock. unsigned num_blocks; /* # blocks in filesystem */ + + // basic stats, for kicks unsigned free_blocks; /* unused blocks */ - //unsigned num_btree_blocks; /* blocks devoted to btrees (sum of chunks) */ unsigned num_objects; - - /* stupid stuff */ unsigned num_fragmented; - - struct ebofs_table object_tab; // object directory - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - + + struct ebofs_nodepool nodepool; + + // tables + struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; + struct ebofs_table object_tab; // object directory struct ebofs_table collection_tab; // collection directory struct ebofs_table oc_tab; struct ebofs_table co_tab; - - struct ebofs_nodepool table_nodepool; }; diff --git a/ceph/include/interval_set.h b/ceph/include/interval_set.h new file mode 100644 index 00000000000..21fe454fad1 --- /dev/null +++ b/ceph/include/interval_set.h @@ -0,0 +1,194 @@ +#ifndef __INTERVAL_SET_H +#define __INTERVAL_SET_H + +#include +#include +#include +using namespace std; + + +template +class interval_set { + public: + map m; // map start -> len + + // helpers + private: + typename map::iterator find_inc(T start) { + typename map::iterator p = m.lower_bound(start); + if (p != m.begin() && + (p->first > start || p == m.end())) { + p--; // might overlap? + if (p->first + p->second <= start) + p++; // it doesn't. + } + return p; + } + + typename map::iterator find_adj(T start) { + typename map::iterator p = m.lower_bound(start); + if (p != m.begin() && + (p->first > start || p == m.end())) { + p--; // might touch? + if (p->first + p->second < start) + p++; // it doesn't. + } + return p; + } + + public: + + void clear() { + m.clear(); + } + + bool contains(T i) { + typename map::iterator p = find_inc(i); + if (p == end()) return false; + if (p->first > i) return false; + if (p->first+p->second <= i) return false; + assert(p->first <= i && p->first+p->second > i); + return true; + } + bool contains(T start, T len) { + typename map::iterator p = find_inc(start); + if (p == end()) return false; + if (p->first > start) return false; + if (p->first+p->second <= start) return false; + assert(p->first <= start && p->first+p->second > start); + if (p->first+p->second < start+len) return false; + return true; + } + + void insert(T start, T len) { + typename map::iterator p = find_adj(start); + if (p == m.end()) { + m[start] = len; // new interval + } else { + if (p->first < start) { + assert(p->first + p->second == start); + p->second += len; // append to end + + typename map::iterator n = p; + n++; + if (start+len == n->first) { // combine with next, too! + p->second += n->second; + m.erase(n); + } + } else { + if (start+len == p->first) { + m[start] = len + p->second; // append to front + m.erase(p); + } else { + assert(p->first > start+len); + m[start] = len; // new interval + } + } + } + } + + void erase(T start, T len) { + typename map::iterator p = find_inc(start); + + assert(p != m.end()); + assert(p->first <= start); + + T before = start - p->first; + assert(p->second >= before+len); + T after = p->second - before - len; + + if (before) + p->second = before; // shorten bit before + else + m.erase(p); + if (after) + m[start+len] = after; + } + + + void subtract(interval_set &a) { + for (typename map::iterator p = a.m.begin(); + p != a.m.end(); + p++) + erase(p->first, p->second); + } + + void insert(interval_set &a) { + for (typename map::iterator p = a.m.begin(); + p != a.m.end(); + p++) + insert(p->first, p->second); + } + + + void intersection_of(interval_set &a, interval_set &b) { + typename map::iterator pa = a.m.begin(); + typename map::iterator pb = b.m.begin(); + + while (pa != a.m.end() && pb != b.m.end()) { + // passing? + if (pa->first + pa->second <= pb->first) + { pa++; continue; } + if (pb->first + pb->second <= pa->first) + { pb++; continue; } + T start = MAX(pa->first, pb->first); + T end = MIN(pa->first+pa->second, pb->first+pb->second); + assert(end > start); + insert(start, end-start); + pa++; + pb++; + } + } + + void union_of(interval_set &a, interval_set &b) { + typename map::iterator pa = a.m.begin(); + typename map::iterator pb = b.m.begin(); + + while (pa != a.m.end() || pb != b.m.end()) { + // passing? + if (pb == b.m.end() || pa->first + pa->second <= pb->first) { + insert(pa->first, pa->second); + pa++; continue; + } + if (pa == a.m.end() || pb->first + pb->second <= pa->first) { + insert(pb->first, pb->second); + pb++; continue; + } + T start = MIN(pa->first, pb->first); + T end = MAX(pa->first+pa->second, pb->first+pb->second); + insert(start, end-start); + pa++; + pb++; + } + } + void union_of(interval_set &a) { + interval_set b; + b.m.swap(m); + union_of(a, b); + } + + bool subset_of(interval_set &big) { + for (typename map::iterator i = m.begin(); + i != m.end(); + i++) + if (!big.contains(i->first, i->second)) return false; + return true; + } + +}; + +template +inline ostream& operator<<(ostream& out, interval_set &s) { + out << "["; + for (typename map::iterator i = s.m.begin(); + i != s.m.end(); + i++) { + if (i != s.m.begin()) out << ","; + out << i->first << "~" << i->second; + } + out << "]"; + return out; +} + + +#endif