mirror of
https://github.com/ceph/ceph
synced 2025-01-03 09:32:43 +00:00
Merge pull request #6609 from liewegas/wip-newstore
newstore: misc updates (including kv and os/fs stuff)
This commit is contained in:
commit
dcd1834ca7
@ -762,7 +762,7 @@ OPTION(keyvaluestore_rocksdb_options, OPT_STR, "")
|
||||
// rocksdb options that will be used for omap (if omap_backend is rocksdb)
|
||||
OPTION(filestore_rocksdb_options, OPT_STR, "")
|
||||
// rocksdb options that will be used in monstore
|
||||
OPTION(mon_rocksdb_options, OPT_STR, "")
|
||||
OPTION(mon_rocksdb_options, OPT_STR, "cache_size=536870912,write_buffer_size=33554432,block_size=65536,compression=kNoCompression")
|
||||
|
||||
/**
|
||||
* osd_*_priority adjust the relative priority of client io, recovery io,
|
||||
@ -814,7 +814,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10)
|
||||
OPTION(newstore_max_dir_size, OPT_U32, 1000000)
|
||||
OPTION(newstore_onode_map_size, OPT_U32, 1024) // onodes per collection
|
||||
OPTION(newstore_backend, OPT_STR, "rocksdb")
|
||||
OPTION(newstore_backend_options, OPT_STR, "")
|
||||
OPTION(newstore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=16,min_write_buffer_number_to_merge=6")
|
||||
OPTION(newstore_fail_eio, OPT_BOOL, true)
|
||||
OPTION(newstore_sync_io, OPT_BOOL, false) // perform initial io synchronously
|
||||
OPTION(newstore_sync_transaction, OPT_BOOL, false) // perform kv txn synchronously
|
||||
@ -836,7 +836,6 @@ OPTION(newstore_overlay_max_length, OPT_INT, 65536)
|
||||
OPTION(newstore_overlay_max, OPT_INT, 32)
|
||||
OPTION(newstore_open_by_handle, OPT_BOOL, true)
|
||||
OPTION(newstore_o_direct, OPT_BOOL, true)
|
||||
OPTION(newstore_db_path, OPT_STR, "")
|
||||
OPTION(newstore_aio, OPT_BOOL, true)
|
||||
OPTION(newstore_aio_poll_ms, OPT_INT, 250) // milliseconds
|
||||
OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096)
|
||||
|
@ -6,6 +6,9 @@
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/table.h"
|
||||
@ -107,6 +110,26 @@ int RocksDBStore::init(string _options_str)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Create the database if it is missing, then open it.
 *
 * Before opening, ensure the write-ahead-log path "<path>.wal" exists
 * (do_open() hands that same path to RocksDB as wal_dir).  When stat()
 * reports it as missing, it is created as a relative symlink whose target
 * is the last component of the db path.
 *
 * @param out  stream that receives a human-readable message on failure
 * @return the result of do_open(out, true), or a negative errno value if
 *         the symlink could not be created
 */
int RocksDBStore::create_and_open(ostream &out)
{
  // create tertiary paths
  const string wal_path = path + ".wal";
  struct stat st;
  int r = ::stat(wal_path.c_str(), &st);
  if (r < 0)
    r = -errno;
  // Only a missing wal path triggers creation; other stat() outcomes
  // (success or a different error) fall through to do_open().
  if (r == -ENOENT) {
    // Relative target: the final component of path.  If path contains no
    // '/', the whole path is that component.
    const string::size_type slashoff = path.rfind('/');
    const string target =
      (slashoff == string::npos) ? path : path.substr(slashoff + 1);
    r = ::symlink(target.c_str(), wal_path.c_str());
    if (r < 0) {
      // Capture errno before touching the stream: the insertions below
      // are allowed to modify it.
      r = -errno;
      out << "failed to symlink " << wal_path << " to " << target;
      return r;
    }
  }
  return do_open(out, true);
}
|
||||
|
||||
int RocksDBStore::do_open(ostream &out, bool create_if_missing)
|
||||
{
|
||||
rocksdb::Options opt;
|
||||
@ -117,6 +140,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing)
|
||||
return -EINVAL;
|
||||
}
|
||||
opt.create_if_missing = create_if_missing;
|
||||
opt.wal_dir = path + ".wal";
|
||||
|
||||
status = rocksdb::DB::Open(opt, path, &db);
|
||||
if (!status.ok()) {
|
||||
|
@ -124,9 +124,7 @@ public:
|
||||
return do_open(out, false);
|
||||
}
|
||||
/// Creates underlying db if missing and opens it
|
||||
int create_and_open(ostream &out) {
|
||||
return do_open(out, true);
|
||||
}
|
||||
int create_and_open(ostream &out);
|
||||
|
||||
void close();
|
||||
|
||||
|
@ -121,16 +121,49 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
|
||||
{
|
||||
int r;
|
||||
|
||||
#ifdef CEPH_HAVE_FALLOCATE
|
||||
# if !defined(DARWIN) && !defined(__FreeBSD__)
|
||||
/*
|
||||
|
||||
From the fallocate(2) man page:
|
||||
|
||||
Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38)
|
||||
in mode deallocates space (i.e., creates a hole) in the byte range
|
||||
starting at offset and continuing for len bytes. Within the specified
|
||||
range, partial filesystem blocks are zeroed, and whole filesystem
|
||||
blocks are removed from the file. After a successful call, subsequent
|
||||
reads from this range will return zeroes.
|
||||
|
||||
The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in
|
||||
mode; in other words, even when punching off the end of the file, the
|
||||
file size (as reported by stat(2)) does not change.
|
||||
|
||||
Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem
|
||||
doesn't support the operation, an error is returned. The operation is
|
||||
supported on at least the following filesystems:
|
||||
|
||||
* XFS (since Linux 2.6.38)
|
||||
|
||||
* ext4 (since Linux 3.0)
|
||||
|
||||
* Btrfs (since Linux 3.7)
|
||||
|
||||
* tmpfs (since Linux 3.5)
|
||||
|
||||
So: we only do this if PUNCH_HOLE *and* KEEP_SIZE are defined.
|
||||
|
||||
*/
|
||||
#if !defined(DARWIN) && !defined(__FreeBSD__)
|
||||
# ifdef CEPH_HAVE_FALLOCATE
|
||||
# ifdef FALLOC_FL_KEEP_SIZE
|
||||
// first try fallocate
|
||||
r = fallocate(fd, FALLOC_FL_PUNCH_HOLE, offset, length);
|
||||
r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length);
|
||||
if (r < 0) {
|
||||
r = -errno;
|
||||
}
|
||||
if (r != -EOPNOTSUPP) {
|
||||
goto out; // a real error
|
||||
}
|
||||
// if that failed (-EOPNOTSUPP), fall back to writing zeros.
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@ -140,7 +173,7 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
|
||||
bufferptr bp(length);
|
||||
bp.zero();
|
||||
bl.append(bp);
|
||||
int r = ::lseek64(fd, offset, SEEK_SET);
|
||||
r = ::lseek64(fd, offset, SEEK_SET);
|
||||
if (r < 0) {
|
||||
r = -errno;
|
||||
goto out;
|
||||
|
@ -564,7 +564,6 @@ NewStore::NewStore(CephContext *cct, const string& path)
|
||||
cct(cct),
|
||||
db(NULL),
|
||||
fs(NULL),
|
||||
db_path(cct->_conf->newstore_db_path),
|
||||
path_fd(-1),
|
||||
fsid_fd(-1),
|
||||
frag_fd(-1),
|
||||
@ -803,7 +802,7 @@ bool NewStore::test_mount_in_use()
|
||||
return ret;
|
||||
}
|
||||
|
||||
int NewStore::_open_db()
|
||||
int NewStore::_open_db(bool create)
|
||||
{
|
||||
assert(!db);
|
||||
char fn[PATH_MAX];
|
||||
@ -817,17 +816,24 @@ int NewStore::_open_db()
|
||||
db = NULL;
|
||||
return -EIO;
|
||||
}
|
||||
db->init(g_conf->newstore_backend_options);
|
||||
string options;
|
||||
if (g_conf->newstore_backend == "rocksdb")
|
||||
options = g_conf->newstore_rocksdb_options;
|
||||
db->init(options);
|
||||
stringstream err;
|
||||
if (db->create_and_open(err)) {
|
||||
int r;
|
||||
if (create)
|
||||
r = db->create_and_open(err);
|
||||
else
|
||||
r = db->open(err);
|
||||
if (r) {
|
||||
derr << __func__ << " erroring opening db: " << err.str() << dendl;
|
||||
delete db;
|
||||
db = NULL;
|
||||
return -EIO;
|
||||
}
|
||||
dout(1) << __func__ << " opened " << g_conf->newstore_backend
|
||||
<< " path " << path
|
||||
<< " options " << g_conf->newstore_backend_options << dendl;
|
||||
<< " path " << path << " options " << options << dendl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -927,12 +933,7 @@ int NewStore::mkfs()
|
||||
if (r < 0)
|
||||
goto out_close_fsid;
|
||||
|
||||
if (db_path != "") {
|
||||
r = symlinkat(db_path.c_str(), path_fd, "db");
|
||||
if (r < 0)
|
||||
goto out_close_frag;
|
||||
}
|
||||
r = _open_db();
|
||||
r = _open_db(true);
|
||||
if (r < 0)
|
||||
goto out_close_frag;
|
||||
|
||||
@ -976,7 +977,7 @@ int NewStore::mount()
|
||||
|
||||
// FIXME: superblock, features
|
||||
|
||||
r = _open_db();
|
||||
r = _open_db(false);
|
||||
if (r < 0)
|
||||
goto out_frag;
|
||||
|
||||
|
@ -461,7 +461,6 @@ private:
|
||||
KeyValueDB *db;
|
||||
FS *fs;
|
||||
uuid_d fsid;
|
||||
string db_path;
|
||||
int path_fd; ///< open handle to $path
|
||||
int fsid_fd; ///< open handle (locked) to $path/fsid
|
||||
int frag_fd; ///< open handle to $path/fragments
|
||||
@ -525,7 +524,7 @@ private:
|
||||
int _open_frag();
|
||||
int _create_frag();
|
||||
void _close_frag();
|
||||
int _open_db();
|
||||
int _open_db(bool create);
|
||||
void _close_db();
|
||||
int _open_collections();
|
||||
void _close_collections();
|
||||
|
Loading…
Reference in New Issue
Block a user