diff --git a/ceph/Makefile b/ceph/Makefile index 8b1b922bc0f..4c34b6414e8 100644 --- a/ceph/Makefile +++ b/ceph/Makefile @@ -8,7 +8,7 @@ # This makes it less annoying to build on non-mpi hosts for dev work, and seems to # behave just fine... change ${CC} back to mpicxx if you get paranoid. CC = g++ -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DUSE_EBOFS +CFLAGS = -pg -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DUSE_EBOFS LIBS = -lpthread -lrt -ldb #for normal mpich2 machines diff --git a/ceph/config.cc b/ceph/config.cc index 52b759017c6..35f8ecd0b9c 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -116,8 +116,9 @@ md_config_t g_conf = { osd_fakestore_syncthreads: 4, osd_ebofs: 0, - ebofs_bc_size: (50 *256), // measured in 4k blocks, or *256 for MB - ebofs_bc_max_dirty: ebofs_bc_size*8/10, // before write() will wait for data to flush + ebofs_commit_interval: 2, // seconds. 0 = no timeout (for debugging/tracing) + ebofs_bc_size: (50 *256), // measured in 4k blocks, or *256 for MB + ebofs_bc_max_dirty: (40 *256), // before write() will wait for data to flush // --- fakeclient (mds regression testing) (ancient history) --- diff --git a/ceph/config.h b/ceph/config.h index 1048e08338c..af8afbf70bc 100644 --- a/ceph/config.h +++ b/ceph/config.h @@ -91,6 +91,7 @@ struct md_config_t { int osd_fakestore_syncthreads; // such crap int osd_ebofs; + int ebofs_commit_interval; off_t ebofs_bc_size; off_t ebofs_bc_max_dirty; diff --git a/ceph/crush/Bucket.h b/ceph/crush/Bucket.h index 054ea4d832b..0ce44cacb44 100644 --- a/ceph/crush/Bucket.h +++ b/ceph/crush/Bucket.h @@ -10,6 +10,8 @@ #include using namespace std; +#include + #include "include/bufferlist.h" namespace crush { diff --git a/ceph/ebofs/AlignedBufferPool.h b/ceph/ebofs/AlignedBufferPool.h index ab099fcefc9..4af885247f4 100644 --- a/ceph/ebofs/AlignedBufferPool.h +++ b/ceph/ebofs/AlignedBufferPool.h @@ -26,12 +26,12 @@ class AlignedBufferPool { bool dommap; public: - AlignedBufferPool(int a) : alignment(a), dommap(false) {} + AlignedBufferPool(int a) : alignment(a), dommap(true) {} ~AlignedBufferPool() { } void free(char *p, unsigned len) { - dout(30) << "bufferpool.free " << (void*)p << " len " << len << endl; + dout(1) << "bufferpool.free " << (void*)p << " len " << len << endl; if (dommap) munmap(p, len); else @@ -54,7 +54,7 @@ class AlignedBufferPool { ::memset(p, 0, bytes); // only to shut up valgrind - dout(30) << "bufferpool.alloc " << (void*)p << endl; + dout(1) << "bufferpool.alloc " << (void*)p << endl; return new buffer(p, bytes, BUFFER_MODE_NOCOPY|BUFFER_MODE_NOFREE|BUFFER_MODE_CUSTOMFREE, bytes, diff --git a/ceph/ebofs/Ebofs.cc b/ceph/ebofs/Ebofs.cc index 405d0ef1286..85c1a0d3f37 100644 --- a/ceph/ebofs/Ebofs.cc +++ b/ceph/ebofs/Ebofs.cc @@ -1,6 +1,9 @@ #include "Ebofs.h" +#include + + // ******************* #undef dout @@ -84,7 +87,7 @@ int Ebofs::mkfs() // create first noderegion Extent nr; nr.start = 2; - nr.length = num_blocks / 100; + nr.length = 10+ (num_blocks / 1000); if (nr.length < 10) nr.length = 10; nodepool.add_region(nr); dout(1) << "mkfs: first node region at " << nr << endl; @@ -271,7 +274,7 @@ int Ebofs::commit_thread_entry() while (mounted) { // wait for kick, or timeout - if (EBOFS_COMMIT_INTERVAL) { + if (g_conf.ebofs_commit_interval) { commit_cond.WaitInterval(ebofs_lock, utime_t(EBOFS_COMMIT_INTERVAL,0)); } else { // DEBUG.. wait until kicked @@ -1364,6 +1367,20 @@ int Ebofs::read(object_t oid, } +bool Ebofs::_write_will_block() +{ + return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); +} + +bool Ebofs::write_will_block() +{ + ebofs_lock.Lock(); + bool b = _write_will_block(); + ebofs_lock.Unlock(); + return b; +} + + int Ebofs::write(object_t oid, size_t len, off_t off, bufferlist& bl, bool fsync) @@ -1401,6 +1418,12 @@ int Ebofs::write(object_t oid, ebofs_lock.Unlock(); return -ENOSPC; } + + // too much unflushed dirty data? (if so, block!) + while (_write_will_block()) { + dout(1) << "write blocking on write" << endl; + bc.waitfor_stat(); + } // get|create inode Onode *on = get_onode(oid); diff --git a/ceph/ebofs/Ebofs.h b/ceph/ebofs/Ebofs.h index c2a9031a0f0..1cdc1398b33 100644 --- a/ceph/ebofs/Ebofs.h +++ b/ceph/ebofs/Ebofs.h @@ -170,6 +170,8 @@ class Ebofs : public ObjectStore { void* entry() { return (void*)ebofs->finisher_thread_entry(); } } finisher_thread; + bool _write_will_block(); + public: Ebofs(BlockDevice& d) : dev(d), @@ -209,6 +211,8 @@ class Ebofs : public ObjectStore { int truncate(object_t oid, off_t size); int remove(object_t oid); + bool write_will_block(); + // object attr int setattr(object_t oid, const char *name, void *value, size_t size); int getattr(object_t oid, const char *name, void *value, size_t size); diff --git a/ceph/ebofs/Onode.h b/ceph/ebofs/Onode.h index 62086814976..1e9021a0342 100644 --- a/ceph/ebofs/Onode.h +++ b/ceph/ebofs/Onode.h @@ -111,21 +111,26 @@ public: void verify_extents() { block_t count = 0; interval_set is; - set s; - for (unsigned i=0; i s; + cout << "verifying" << endl; + for (unsigned i=0; i= 0); assert(off + len <= length()); - + /*assert(off < length()); + if (off + len > length()) + len = length() - off; + */ // advance to off list::iterator curbuf = _buffers.begin(); diff --git a/ceph/osd/OSD.cc b/ceph/osd/OSD.cc index 31fa3fed197..56d28e5624e 100644 --- a/ceph/osd/OSD.cc +++ b/ceph/osd/OSD.cc @@ -86,7 +86,10 @@ OSD::OSD(int id, Messenger *m) # ifdef USE_EBOFS storedev = 0; if (g_conf.osd_ebofs) { - sprintf(ebofs_path, "%s/%d", ebofs_base_path, whoami); + char hostname[100]; + hostname[0] = 0; + gethostname(hostname,100); + sprintf(ebofs_path, "%s/%s", ebofs_base_path, hostname); storedev = new BlockDevice(ebofs_path); store = new Ebofs(*storedev); } else diff --git a/ceph/tcpsyn.cc b/ceph/tcpsyn.cc index db10013e567..851278b5c1c 100644 --- a/ceph/tcpsyn.cc +++ b/ceph/tcpsyn.cc @@ -120,6 +120,15 @@ int main(int argc, char **argv) char s[80]; sprintf(s,"clnode.%d", myrank); client_logger = new Logger(s, &client_logtype); + + client_logtype.add_inc("lsum"); + client_logtype.add_inc("lnum"); + client_logtype.add_inc("lwsum"); + client_logtype.add_inc("lwnum"); + client_logtype.add_inc("lrsum"); + client_logtype.add_inc("lrnum"); + client_logtype.add_inc("trsum"); + client_logtype.add_inc("trnum"); } client[i]->init(); @@ -131,12 +140,16 @@ int main(int argc, char **argv) it != clientlist.end(); it++) { int i = *it; - // use my argc, argv (make sure you pass a mount point!) - //cout << "mounting" << endl; + + client[i]->mount(mkfs); //cout << "starting synthetic client on rank " << myrank << endl; syn[i] = new SyntheticClient(client[i]); + + syn[i]->modes = syn_modes; + syn[i]->sargs = syn_sargs; + syn[i]->iargs = syn_iargs; syn[i]->start_thread();