diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a0e6ad7353d..cdf2d56c8ca 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -762,7 +762,7 @@ OPTION(keyvaluestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used for omap(if omap_backend is rocksdb) OPTION(filestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used in monstore -OPTION(mon_rocksdb_options, OPT_STR, "") +OPTION(mon_rocksdb_options, OPT_STR, "cache_size=536870912,write_buffer_size=33554432,block_size=65536,compression=kNoCompression") /** * osd_*_priority adjust the relative priority of client io, recovery io, @@ -814,7 +814,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10) OPTION(newstore_max_dir_size, OPT_U32, 1000000) OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection OPTION(newstore_backend, OPT_STR, "rocksdb") -OPTION(newstore_backend_options, OPT_STR, "") +OPTION(newstore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=16,min_write_buffer_number_to_merge=6") OPTION(newstore_fail_eio, OPT_BOOL, true) OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously @@ -836,7 +836,6 @@ OPTION(newstore_overlay_max_length, OPT_INT, 65536) OPTION(newstore_overlay_max, OPT_INT, 32) OPTION(newstore_open_by_handle, OPT_BOOL, true) OPTION(newstore_o_direct, OPT_BOOL, true) -OPTION(newstore_db_path, OPT_STR, "") OPTION(newstore_aio, OPT_BOOL, true) OPTION(newstore_aio_poll_ms, OPT_INT, 250) // milliseconds OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096) diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 6dfde87a890..a6d071e48aa 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #include "rocksdb/db.h" #include "rocksdb/table.h" @@ -107,6 +110,26 @@ int RocksDBStore::init(string _options_str) return 0; } +int RocksDBStore::create_and_open(ostream &out) +{ + // create tertiary paths + string wal_path = path + ".wal"; + struct stat st; + int r = ::stat(wal_path.c_str(), &st); + if (r < 0) + r = -errno; + if (r == -ENOENT) { + unsigned slashoff = path.rfind('/'); + string target = path.substr(slashoff + 1); + r = ::symlink(target.c_str(), wal_path.c_str()); + if (r < 0) { + out << "failed to symlink " << wal_path << " to " << target; + return -errno; + } + } + return do_open(out, true); +} + int RocksDBStore::do_open(ostream &out, bool create_if_missing) { rocksdb::Options opt; @@ -117,6 +140,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) return -EINVAL; } opt.create_if_missing = create_if_missing; + opt.wal_dir = path + ".wal"; status = rocksdb::DB::Open(opt, path, &db); if (!status.ok()) { diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 257bb2d1c24..90523c451b7 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -124,9 +124,7 @@ public: return do_open(out, false); } /// Creates underlying db if missing and opens it - int create_and_open(ostream &out) { - return do_open(out, true); - } + int create_and_open(ostream &out); void close(); diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc index cb0bdd53a13..b7c7987997c 100644 --- a/src/os/fs/FS.cc +++ b/src/os/fs/FS.cc @@ -121,16 +121,49 @@ int FS::zero(int fd, uint64_t offset, uint64_t length) { int r; -#ifdef CEPH_HAVE_FALLOCATE -# if !defined(DARWIN) && !defined(__FreeBSD__) + /* + + From the fallocate(2) man page: + + Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38) + in mode deallocates space (i.e., creates a hole) in the byte range + starting at offset and continuing for len bytes. Within the specified + range, partial filesystem blocks are zeroed, and whole filesystem + blocks are removed from the file. After a successful call, subsequent + reads from this range will return zeroes. + + The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in + mode; in other words, even when punching off the end of the file, the + file size (as reported by stat(2)) does not change. + + Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem + doesn't support the operation, an error is returned. The operation is + supported on at least the following filesystems: + + * XFS (since Linux 2.6.38) + + * ext4 (since Linux 3.0) + + * Btrfs (since Linux 3.7) + + * tmpfs (since Linux 3.5) + + So: we only do this is PUNCH_HOLE *and* KEEP_SIZE are defined. + + */ +#if !defined(DARWIN) && !defined(__FreeBSD__) +# ifdef CEPH_HAVE_FALLOCATE +# ifdef FALLOC_FL_KEEP_SIZE // first try fallocate - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE, offset, length); + r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length); if (r < 0) { r = -errno; } if (r != -EOPNOTSUPP) { goto out; // a real error } + // if that failed (-EOPNOTSUPP), fall back to writing zeros. +# endif # endif #endif @@ -140,7 +173,7 @@ int FS::zero(int fd, uint64_t offset, uint64_t length) bufferptr bp(length); bp.zero(); bl.append(bp); - int r = ::lseek64(fd, offset, SEEK_SET); + r = ::lseek64(fd, offset, SEEK_SET); if (r < 0) { r = -errno; goto out; diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc index bb69babc235..e1b4d73ae16 100644 --- a/src/os/newstore/NewStore.cc +++ b/src/os/newstore/NewStore.cc @@ -564,7 +564,6 @@ NewStore::NewStore(CephContext *cct, const string& path) cct(cct), db(NULL), fs(NULL), - db_path(cct->_conf->newstore_db_path), path_fd(-1), fsid_fd(-1), frag_fd(-1), @@ -803,7 +802,7 @@ bool NewStore::test_mount_in_use() return ret; } -int NewStore::_open_db() +int NewStore::_open_db(bool create) { assert(!db); char fn[PATH_MAX]; @@ -817,17 +816,24 @@ int NewStore::_open_db() db = NULL; return -EIO; } - db->init(g_conf->newstore_backend_options); + string options; + if (g_conf->newstore_backend == "rocksdb") + options = g_conf->newstore_rocksdb_options; + db->init(options); stringstream err; - if (db->create_and_open(err)) { + int r; + if (create) + r = db->create_and_open(err); + else + r = db->open(err); + if (r) { derr << __func__ << " erroring opening db: " << err.str() << dendl; delete db; db = NULL; return -EIO; } dout(1) << __func__ << " opened " << g_conf->newstore_backend - << " path " << path - << " options " << g_conf->newstore_backend_options << dendl; + << " path " << path << " options " << options << dendl; return 0; } @@ -927,12 +933,7 @@ int NewStore::mkfs() if (r < 0) goto out_close_fsid; - if (db_path != "") { - r = symlinkat(db_path.c_str(), path_fd, "db"); - if (r < 0) - goto out_close_frag; - } - r = _open_db(); + r = _open_db(true); if (r < 0) goto out_close_frag; @@ -976,7 +977,7 @@ int NewStore::mount() // FIXME: superblock, features - r = _open_db(); + r = _open_db(false); if (r < 0) goto out_frag; diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h index 0e3b39350ef..c32e2a9d45e 100644 --- a/src/os/newstore/NewStore.h +++ b/src/os/newstore/NewStore.h @@ -461,7 +461,6 @@ private: KeyValueDB *db; FS *fs; uuid_d fsid; - string db_path; int path_fd; ///< open handle to $path int fsid_fd; ///< open handle (locked) to $path/fsid int frag_fd; ///< open handle to $path/fragments @@ -525,7 +524,7 @@ private: int _open_frag(); int _create_frag(); void _close_frag(); - int _open_db(); + int _open_db(bool create); void _close_db(); int _open_collections(); void _close_collections();