From 7f07e1ee22a18528a2745edad0ba8b9dba08ff7d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 20 Sep 2015 13:41:51 -0400 Subject: [PATCH 01/10] os/newstore: set rocksdb default options max_write_buffer_number=16 min_write_buffer_number_to_merge=6 This cuts the amount of short-lived WAL data that gets rewritten by roughly a factor of 6. Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index dbad270a3b9..1d6edd8d03c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -812,7 +812,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10) OPTION(newstore_max_dir_size, OPT_U32, 1000000) OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection OPTION(newstore_backend, OPT_STR, "rocksdb") -OPTION(newstore_backend_options, OPT_STR, "") +OPTION(newstore_backend_options, OPT_STR, "max_write_buffer_number=16,min_write_buffer_number_to_merge=6") OPTION(newstore_fail_eio, OPT_BOOL, true) OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously From 9c0ae4b86d8f8f78252c5d893c5d8960347cb218 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 28 Sep 2015 22:35:04 -0400 Subject: [PATCH 02/10] os/newstore: newstore_backend_options -> newstore_rocksdb_options This way we can have default settings per-backend. Also note that this is what we currently do with leveldb on the mon and osd. 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 +- src/os/newstore/NewStore.cc | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 1d6edd8d03c..cc5de7858b5 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -812,7 +812,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10) OPTION(newstore_max_dir_size, OPT_U32, 1000000) OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection OPTION(newstore_backend, OPT_STR, "rocksdb") -OPTION(newstore_backend_options, OPT_STR, "max_write_buffer_number=16,min_write_buffer_number_to_merge=6") +OPTION(newstore_rocksdb_options, OPT_STR, "max_write_buffer_number=16,min_write_buffer_number_to_merge=6") OPTION(newstore_fail_eio, OPT_BOOL, true) OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index bb69babc235..f25d1ea9cea 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -817,7 +817,10 @@ int NewStore::_open_db() db = NULL; return -EIO; } - db->init(g_conf->newstore_backend_options); + string options; + if (g_conf->newstore_backend == "rocksdb") + options = g_conf->newstore_rocksdb_options; + db->init(options); stringstream err; if (db->create_and_open(err)) { derr << __func__ << " erroring opening db: " << err.str() << dendl; @@ -826,8 +829,7 @@ int NewStore::_open_db() return -EIO; } dout(1) << __func__ << " opened " << g_conf->newstore_backend - << " path " << path - << " options " << g_conf->newstore_backend_options << dendl; + << " path " << path << " options " << options << dendl; return 0; } From 22c9310bd8663e458ba47b70def6f4fe2b843004 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Oct 2015 15:34:58 -0400 Subject: [PATCH 03/10] os/newstore: remove newstore_db_path option It is simpler to have 
fixed locations and symlinks. Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 - src/os/newstore/NewStore.cc | 6 ------ src/os/newstore/NewStore.h | 1 - 3 files changed, 8 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index cc5de7858b5..d3fa4e00f00 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -834,7 +834,6 @@ OPTION(newstore_overlay_max_length, OPT_INT, 65536) OPTION(newstore_overlay_max, OPT_INT, 32) OPTION(newstore_open_by_handle, OPT_BOOL, true) OPTION(newstore_o_direct, OPT_BOOL, true) -OPTION(newstore_db_path, OPT_STR, "") OPTION(newstore_aio, OPT_BOOL, true) OPTION(newstore_aio_poll_ms, OPT_INT, 250) // milliseconds OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index f25d1ea9cea..436dda799a2 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -564,7 +564,6 @@ NewStore::NewStore(CephContext *cct, const string& path) cct(cct), db(NULL), fs(NULL), - db_path(cct->_conf->newstore_db_path), path_fd(-1), fsid_fd(-1), frag_fd(-1), @@ -929,11 +928,6 @@ int NewStore::mkfs() if (r < 0) goto out_close_fsid; - if (db_path != "") { - r = symlinkat(db_path.c_str(), path_fd, "db"); - if (r < 0) - goto out_close_frag; - } r = _open_db(); if (r < 0) goto out_close_frag; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 0e3b39350ef..ae1eb9bc2b2 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -461,7 +461,6 @@ private: KeyValueDB *db; FS *fs; uuid_d fsid; - string db_path; int path_fd; ///< open handle to $path int fsid_fd; ///< open handle (locked) to $path/fsid int frag_fd; ///< open handle to $path/fragments From 0dac747c79be4da80c2034b44337f5c2b546b3ff Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Oct 2015 16:01:04 -0400 Subject: [PATCH 04/10] os/newstore: distinguish between db open and create Signed-off-by: Sage Weil --- 
src/os/newstore/NewStore.cc | 13 +++++++++---- src/os/newstore/NewStore.h | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index 436dda799a2..e1b4d73ae16 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -802,7 +802,7 @@ bool NewStore::test_mount_in_use() return ret; } -int NewStore::_open_db() +int NewStore::_open_db(bool create) { assert(!db); char fn[PATH_MAX]; @@ -821,7 +821,12 @@ int NewStore::_open_db() options = g_conf->newstore_rocksdb_options; db->init(options); stringstream err; - if (db->create_and_open(err)) { + int r; + if (create) + r = db->create_and_open(err); + else + r = db->open(err); + if (r) { derr << __func__ << " erroring opening db: " << err.str() << dendl; delete db; db = NULL; @@ -928,7 +933,7 @@ int NewStore::mkfs() if (r < 0) goto out_close_fsid; - r = _open_db(); + r = _open_db(true); if (r < 0) goto out_close_frag; @@ -972,7 +977,7 @@ int NewStore::mount() // FIXME: superblock, features - r = _open_db(); + r = _open_db(false); if (r < 0) goto out_frag; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index ae1eb9bc2b2..c32e2a9d45e 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -524,7 +524,7 @@ private: int _open_frag(); int _create_frag(); void _close_frag(); - int _open_db(); + int _open_db(bool create); void _close_db(); int _open_collections(); void _close_collections(); From eddb00bd4f505d168cdbd53a8574313628fe8b91 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Oct 2015 16:00:37 -0400 Subject: [PATCH 05/10] os/RocksDBStore: set up $path.wal -> $path symlink If $path.wal doesn't exist, create it and symlink it to $path. Set wal_dir to that. This makes it easy to move the wal content elsewhere later, or to pre-create the .wal dir. 
Signed-off-by: Sage Weil --- src/kv/RocksDBStore.cc | 24 ++++++++++++++++++++++++ src/kv/RocksDBStore.h | 4 +--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 6dfde87a890..a6d071e48aa 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #include "rocksdb/db.h" #include "rocksdb/table.h" @@ -107,6 +110,26 @@ int RocksDBStore::init(string _options_str) return 0; } +int RocksDBStore::create_and_open(ostream &out) +{ + // create tertiary paths + string wal_path = path + ".wal"; + struct stat st; + int r = ::stat(wal_path.c_str(), &st); + if (r < 0) + r = -errno; + if (r == -ENOENT) { + unsigned slashoff = path.rfind('/'); + string target = path.substr(slashoff + 1); + r = ::symlink(target.c_str(), wal_path.c_str()); + if (r < 0) { + out << "failed to symlink " << wal_path << " to " << target; + return -errno; + } + } + return do_open(out, true); +} + int RocksDBStore::do_open(ostream &out, bool create_if_missing) { rocksdb::Options opt; @@ -117,6 +140,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) return -EINVAL; } opt.create_if_missing = create_if_missing; + opt.wal_dir = path + ".wal"; status = rocksdb::DB::Open(opt, path, &db); if (!status.ok()) { diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 257bb2d1c24..90523c451b7 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -124,9 +124,7 @@ public: return do_open(out, false); } /// Creates underlying db if missing and opens it - int create_and_open(ostream &out) { - return do_open(out, true); - } + int create_and_open(ostream &out); void close(); From 2d921138f678282b2d8e2d46456720d4b7142a31 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Oct 2015 14:47:14 -0400 Subject: [PATCH 06/10] os/fs/FS: fix zero() return value on fallback Signed-off-by: Sage Weil --- src/os/fs/FS.cc | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc index cb0bdd53a13..63389486ae9 100644 --- a/src/os/fs/FS.cc +++ b/src/os/fs/FS.cc @@ -140,7 +140,7 @@ int FS::zero(int fd, uint64_t offset, uint64_t length) bufferptr bp(length); bp.zero(); bl.append(bp); - int r = ::lseek64(fd, offset, SEEK_SET); + r = ::lseek64(fd, offset, SEEK_SET); if (r < 0) { r = -errno; goto out; From ba60bf05b034b865dd5833ac7d4299f0dbce7c5e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Oct 2015 14:47:46 -0400 Subject: [PATCH 07/10] os/fs/FS: fix zero()'s PUNCH_HOLE incancation We get EOPNOTSUPP unconditionally without KEEP_SIZE. Signed-off-by: Sage Weil --- src/os/fs/FS.cc | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc index 63389486ae9..b7c7987997c 100644 --- a/src/os/fs/FS.cc +++ b/src/os/fs/FS.cc @@ -121,16 +121,49 @@ int FS::zero(int fd, uint64_t offset, uint64_t length) { int r; -#ifdef CEPH_HAVE_FALLOCATE -# if !defined(DARWIN) && !defined(__FreeBSD__) + /* + + From the fallocate(2) man page: + + Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38) + in mode deallocates space (i.e., creates a hole) in the byte range + starting at offset and continuing for len bytes. Within the specified + range, partial filesystem blocks are zeroed, and whole filesystem + blocks are removed from the file. After a successful call, subsequent + reads from this range will return zeroes. + + The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in + mode; in other words, even when punching off the end of the file, the + file size (as reported by stat(2)) does not change. + + Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem + doesn't support the operation, an error is returned. 
The operation is + supported on at least the following filesystems: + + * XFS (since Linux 2.6.38) + + * ext4 (since Linux 3.0) + + * Btrfs (since Linux 3.7) + + * tmpfs (since Linux 3.5) + + So: we only do this is PUNCH_HOLE *and* KEEP_SIZE are defined. + + */ +#if !defined(DARWIN) && !defined(__FreeBSD__) +# ifdef CEPH_HAVE_FALLOCATE +# ifdef FALLOC_FL_KEEP_SIZE // first try fallocate - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE, offset, length); + r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length); if (r < 0) { r = -errno; } if (r != -EOPNOTSUPP) { goto out; // a real error } + // if that failed (-EOPNOTSUPP), fall back to writing zeros. +# endif # endif #endif From ae516d7d63d94c11c4c010f607b9d23779d247b2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 2 Oct 2015 09:15:33 -0400 Subject: [PATCH 08/10] mon: disabled rocksdb compression when used as the backend This significantly reduced CPU utilization on the bigbang scale testing cluster at CERN. Note that it is already disabled for leveldb by default (in ceph_mon.cc). 
Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d3fa4e00f00..ec57ee959df 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -760,7 +760,7 @@ OPTION(keyvaluestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used for omap(if omap_backend is rocksdb) OPTION(filestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used in monstore -OPTION(mon_rocksdb_options, OPT_STR, "") +OPTION(mon_rocksdb_options, OPT_STR, "compression=kNoCompression") /** * osd_*_priority adjust the relative priority of client io, recovery io, From d8854890620d3f3fd12581ba512761e311341c00 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 6 Oct 2015 14:38:47 -0400 Subject: [PATCH 09/10] common: mirror leveldb default tuning w/ rocksdb Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ec57ee959df..1d128a2a598 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -760,7 +760,7 @@ OPTION(keyvaluestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used for omap(if omap_backend is rocksdb) OPTION(filestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used in monstore -OPTION(mon_rocksdb_options, OPT_STR, "compression=kNoCompression") +OPTION(mon_rocksdb_options, OPT_STR, "cache_size=536870912,write_buffer_size=33554432,block_size=65536,compression=kNoCompression") /** * osd_*_priority adjust the relative priority of client io, recovery io, From 6df48f8e38428587695c8a8bd448bc9ad1cd6af7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 16 Oct 2015 13:07:29 -0400 Subject: [PATCH 10/10] os/newstore: disable rocksdb compression This has been shown to be problematic for performance on the monitor. 
Note that this takes us from ~170 bytes per onode to ~540 bytes per onode.