Merge pull request #6609 from liewegas/wip-newstore

newstore: misc updates (including kv and os/fs stuff)
This commit is contained in:
Sage Weil 2015-11-20 06:55:14 -05:00
commit dcd1834ca7
6 changed files with 79 additions and 25 deletions

View File

@ -762,7 +762,7 @@ OPTION(keyvaluestore_rocksdb_options, OPT_STR, "")
// rocksdb options that will be used for omap (if omap_backend is rocksdb)
OPTION(filestore_rocksdb_options, OPT_STR, "")
// rocksdb options that will be used in monstore
OPTION(mon_rocksdb_options, OPT_STR, "")
OPTION(mon_rocksdb_options, OPT_STR, "cache_size=536870912,write_buffer_size=33554432,block_size=65536,compression=kNoCompression")
/**
* osd_*_priority adjust the relative priority of client io, recovery io,
@ -814,7 +814,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10)
OPTION(newstore_max_dir_size, OPT_U32, 1000000)
OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection
OPTION(newstore_backend, OPT_STR, "rocksdb")
OPTION(newstore_backend_options, OPT_STR, "")
OPTION(newstore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=16,min_write_buffer_number_to_merge=6")
OPTION(newstore_fail_eio, OPT_BOOL, true)
OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously
OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously
@ -836,7 +836,6 @@ OPTION(newstore_overlay_max_length, OPT_INT, 65536)
OPTION(newstore_overlay_max, OPT_INT, 32)
OPTION(newstore_open_by_handle, OPT_BOOL, true)
OPTION(newstore_o_direct, OPT_BOOL, true)
OPTION(newstore_db_path, OPT_STR, "")
OPTION(newstore_aio, OPT_BOOL, true)
OPTION(newstore_aio_poll_ms, OPT_INT, 250) // milliseconds
OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096)

View File

@ -6,6 +6,9 @@
#include <string>
#include <memory>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "rocksdb/db.h"
#include "rocksdb/table.h"
@ -107,6 +110,26 @@ int RocksDBStore::init(string _options_str)
return 0;
}
/**
 * Create the underlying db if it is missing and open it.
 *
 * Before opening, make sure the tertiary "<path>.wal" entry exists: when
 * stat() reports ENOENT for it, a symlink named "<path>.wal" is created
 * whose target is the last component of `path` (a relative link).
 * Any other stat() result (success or a different error) leaves the
 * filesystem untouched and falls through to do_open().
 *
 * @param out  stream that receives a human-readable message on failure
 * @return     -errno from symlink() if the link could not be created,
 *             otherwise whatever do_open(out, true) returns
 */
int RocksDBStore::create_and_open(ostream &out)
{
  // create tertiary paths
  const string wal_path = path + ".wal";
  struct stat st;
  int r = ::stat(wal_path.c_str(), &st);
  if (r < 0)
    r = -errno;
  if (r == -ENOENT) {
    // rfind() returns size_t; keep the full width.  When there is no '/'
    // the result is npos, and npos + 1 wraps to 0, so the whole path is
    // used as the link target.
    const size_t slashoff = path.rfind('/');
    const string target = path.substr(slashoff + 1);
    r = ::symlink(target.c_str(), wal_path.c_str());
    if (r < 0) {
      // Save errno first: the stream insertions below may modify it, and
      // returning a clobbered (possibly zero) value would hide the failure.
      const int err = errno;
      out << "failed to symlink " << wal_path << " to " << target;
      return -err;
    }
  }
  return do_open(out, true);
}
int RocksDBStore::do_open(ostream &out, bool create_if_missing)
{
rocksdb::Options opt;
@ -117,6 +140,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing)
return -EINVAL;
}
opt.create_if_missing = create_if_missing;
opt.wal_dir = path + ".wal";
status = rocksdb::DB::Open(opt, path, &db);
if (!status.ok()) {

View File

@ -124,9 +124,7 @@ public:
return do_open(out, false);
}
/// Creates underlying db if missing and opens it
int create_and_open(ostream &out) {
return do_open(out, true);
}
int create_and_open(ostream &out);
void close();

View File

@ -121,16 +121,49 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
{
int r;
#ifdef CEPH_HAVE_FALLOCATE
# if !defined(DARWIN) && !defined(__FreeBSD__)
/*
From the fallocate(2) man page:
Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38)
in mode deallocates space (i.e., creates a hole) in the byte range
starting at offset and continuing for len bytes. Within the specified
range, partial filesystem blocks are zeroed, and whole filesystem
blocks are removed from the file. After a successful call, subsequent
reads from this range will return zeroes.
The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in
mode; in other words, even when punching off the end of the file, the
file size (as reported by stat(2)) does not change.
Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem
doesn't support the operation, an error is returned. The operation is
supported on at least the following filesystems:
* XFS (since Linux 2.6.38)
* ext4 (since Linux 3.0)
* Btrfs (since Linux 3.7)
* tmpfs (since Linux 3.5)
So: we only do this if PUNCH_HOLE *and* KEEP_SIZE are defined.
*/
#if !defined(DARWIN) && !defined(__FreeBSD__)
# ifdef CEPH_HAVE_FALLOCATE
# ifdef FALLOC_FL_KEEP_SIZE
// first try fallocate
r = fallocate(fd, FALLOC_FL_PUNCH_HOLE, offset, length);
r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length);
if (r < 0) {
r = -errno;
}
if (r != -EOPNOTSUPP) {
goto out; // a real error
}
// if that failed (-EOPNOTSUPP), fall back to writing zeros.
# endif
# endif
#endif
@ -140,7 +173,7 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
bufferptr bp(length);
bp.zero();
bl.append(bp);
int r = ::lseek64(fd, offset, SEEK_SET);
r = ::lseek64(fd, offset, SEEK_SET);
if (r < 0) {
r = -errno;
goto out;

View File

@ -564,7 +564,6 @@ NewStore::NewStore(CephContext *cct, const string& path)
cct(cct),
db(NULL),
fs(NULL),
db_path(cct->_conf->newstore_db_path),
path_fd(-1),
fsid_fd(-1),
frag_fd(-1),
@ -803,7 +802,7 @@ bool NewStore::test_mount_in_use()
return ret;
}
int NewStore::_open_db()
int NewStore::_open_db(bool create)
{
assert(!db);
char fn[PATH_MAX];
@ -817,17 +816,24 @@ int NewStore::_open_db()
db = NULL;
return -EIO;
}
db->init(g_conf->newstore_backend_options);
string options;
if (g_conf->newstore_backend == "rocksdb")
options = g_conf->newstore_rocksdb_options;
db->init(options);
stringstream err;
if (db->create_and_open(err)) {
int r;
if (create)
r = db->create_and_open(err);
else
r = db->open(err);
if (r) {
derr << __func__ << " erroring opening db: " << err.str() << dendl;
delete db;
db = NULL;
return -EIO;
}
dout(1) << __func__ << " opened " << g_conf->newstore_backend
<< " path " << path
<< " options " << g_conf->newstore_backend_options << dendl;
<< " path " << path << " options " << options << dendl;
return 0;
}
@ -927,12 +933,7 @@ int NewStore::mkfs()
if (r < 0)
goto out_close_fsid;
if (db_path != "") {
r = symlinkat(db_path.c_str(), path_fd, "db");
if (r < 0)
goto out_close_frag;
}
r = _open_db();
r = _open_db(true);
if (r < 0)
goto out_close_frag;
@ -976,7 +977,7 @@ int NewStore::mount()
// FIXME: superblock, features
r = _open_db();
r = _open_db(false);
if (r < 0)
goto out_frag;

View File

@ -461,7 +461,6 @@ private:
KeyValueDB *db;
FS *fs;
uuid_d fsid;
string db_path;
int path_fd; ///< open handle to $path
int fsid_fd; ///< open handle (locked) to $path/fsid
int frag_fd; ///< open handle to $path/fragments
@ -525,7 +524,7 @@ private:
int _open_frag();
int _create_frag();
void _close_frag();
int _open_db();
int _open_db(bool create);
void _close_db();
int _open_collections();
void _close_collections();