Merge pull request #8431 from liewegas/wip-bluestore

os/bluestore: revamp BlueFS bdev management and add perfcounters
This commit is contained in:
Sage Weil 2016-04-06 15:48:45 -04:00
commit 78d4fada4b
4 changed files with 263 additions and 123 deletions

View File

@ -5,6 +5,7 @@
#include "common/debug.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "BlockDevice.h"
#include "Allocator.h"
#include "StupidAllocator.h"
@ -15,23 +16,76 @@
#define dout_prefix *_dout << "bluefs "
BlueFS::BlueFS()
: ino_last(0),
: logger(NULL),
ino_last(0),
log_seq(0),
log_writer(NULL)
log_writer(NULL),
bdev(MAX_BDEV),
ioc(MAX_BDEV),
block_all(MAX_BDEV),
block_total(MAX_BDEV, 0)
{
}
BlueFS::~BlueFS()
{
for (auto p : bdev) {
p->close();
delete p;
if (p) {
p->close();
delete p;
}
}
for (auto p : ioc) {
delete p;
}
}
void BlueFS::_init_logger()
{
PerfCountersBuilder b(g_ceph_context, "BlueFS",
l_bluefs_first, l_bluefs_last);
b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", "Bytes gifted from BlueStore");
b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", "Bytes reclaimed by BlueStore");
b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", "Total bytes (main db device)");
b.add_u64(l_bluefs_db_free_bytes, "db_free_bytes", "Free bytes (main db device)");
b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", "Total bytes (wal device)");
b.add_u64(l_bluefs_wal_free_bytes, "wal_free_bytes", "Free bytes (wal device)");
b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", "Total bytes (slow device)");
b.add_u64(l_bluefs_slow_free_bytes, "slow_free_bytes", "Free bytes (slow device)");
b.add_u64(l_bluefs_num_files, "num_files", "File count");
b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log");
b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", "Compactions of the metadata log");
b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", "Bytes written to the metadata log");
logger = b.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(logger);
}
void BlueFS::_shutdown_logger()
{
g_ceph_context->get_perfcounters_collection()->remove(logger);
delete logger;
}
// Push the current file count, metadata log size, and per-device total/free
// byte figures into the perf counters.
//
// The caller must already hold the lock. Devices without an allocator are
// skipped, leaving their counters untouched.
void BlueFS::_update_logger_stats()
{
  logger->set(l_bluefs_num_files, file_map.size());
  logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);

  // one row per device: which allocator slot to read and which two counters
  // receive its total and free byte counts (order: wal, db, slow)
  const struct {
    unsigned id;
    int total_idx;
    int free_idx;
  } devices[] = {
    { BDEV_WAL,  l_bluefs_wal_total_bytes,  l_bluefs_wal_free_bytes },
    { BDEV_DB,   l_bluefs_db_total_bytes,   l_bluefs_db_free_bytes },
    { BDEV_SLOW, l_bluefs_slow_total_bytes, l_bluefs_slow_free_bytes },
  };

  for (const auto& dev : devices) {
    if (!alloc[dev.id])
      continue;
    logger->set(dev.total_idx, block_total[dev.id]);
    logger->set(dev.free_idx, alloc[dev.id]->get_free());
  }
}
/*static void aio_cb(void *priv, void *priv2)
{
BlueFS *fs = static_cast<BlueFS*>(priv);
@ -42,7 +96,8 @@ BlueFS::~BlueFS()
int BlueFS::add_block_device(unsigned id, string path)
{
dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
assert(id == bdev.size());
assert(id < bdev.size());
assert(bdev[id] == NULL);
BlockDevice *b = BlockDevice::create(path, NULL, NULL); //aio_cb, this);
int r = b->open(path);
if (r < 0) {
@ -51,15 +106,16 @@ int BlueFS::add_block_device(unsigned id, string path)
}
dout(1) << __func__ << " bdev " << id << " path " << path
<< " size " << pretty_si_t(b->get_size()) << "B" << dendl;
bdev.push_back(b);
ioc.push_back(new IOContext(NULL));
block_all.resize(bdev.size());
bdev[id] = b;
ioc[id] = new IOContext(NULL);
return 0;
}
uint64_t BlueFS::get_block_device_size(unsigned id)
{
return bdev[id]->get_size();
if (bdev[id])
return bdev[id]->get_size();
return 0;
}
void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
@ -68,8 +124,10 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
dout(1) << __func__ << " bdev " << id << " " << offset << "~" << length
<< dendl;
assert(id < bdev.size());
assert(bdev[id]);
assert(bdev[id]->get_size() >= offset + length);
block_all[id].insert(offset, length);
block_total[id] += length;
if (alloc.size()) {
log_t.op_alloc_add(id, offset, length);
@ -77,6 +135,9 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
assert(r == 0);
alloc[id]->init_add_free(offset, length);
}
if (logger)
logger->inc(l_bluefs_gift_bytes, length);
dout(10) << __func__ << " done" << dendl;
}
@ -86,6 +147,7 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
std::lock_guard<std::mutex> l(lock);
dout(1) << __func__ << " bdev " << id << " want " << want << dendl;
assert(id < alloc.size());
assert(alloc[id]);
int r = alloc[id]->reserve(want);
assert(r == 0); // caller shouldn't ask for more than they can get
@ -96,10 +158,13 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
alloc[id]->unreserve(want - *length);
block_all[id].erase(*offset, *length);
block_total[id] -= *length;
log_t.op_alloc_rm(id, *offset, *length);
r = _flush_log();
assert(r == 0);
if (logger)
logger->inc(l_bluefs_reclaim_bytes, *length);
dout(1) << __func__ << " bdev " << id << " want " << want
<< " got " << *offset << "~" << *length << dendl;
return 0;
@ -109,12 +174,7 @@ uint64_t BlueFS::get_total(unsigned id)
{
std::lock_guard<std::mutex> l(lock);
assert(id < block_all.size());
uint64_t r = 0;
interval_set<uint64_t>& p = block_all[id];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
r += q.get_len();
}
return r;
return block_total[id];
}
uint64_t BlueFS::get_free(unsigned id)
@ -129,14 +189,14 @@ void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
std::lock_guard<std::mutex> l(lock);
usage->resize(bdev.size());
for (unsigned id = 0; id < bdev.size(); ++id) {
uint64_t total = 0;
interval_set<uint64_t>& p = block_all[id];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
total += q.get_len();
if (!bdev[id]) {
(*usage)[id] = make_pair(0, 0);
continue;
}
(*usage)[id].first = alloc[id]->get_free();
(*usage)[id].second = total;
uint64_t used = (total - (*usage)[id].first) * 100 / total;
(*usage)[id].second = block_total[id];
uint64_t used =
(block_total[id] - (*usage)[id].first) * 100 / block_total[id];
dout(10) << __func__ << " bdev " << id
<< " free " << (*usage)[id].first
<< " (" << pretty_si_t((*usage)[id].first) << "B)"
@ -162,12 +222,12 @@ int BlueFS::mkfs(uuid_d osd_uuid)
dout(1) << __func__
<< " osd_uuid " << osd_uuid
<< dendl;
assert(bdev.size() >= 1);
_init_alloc();
_init_logger();
super.version = 1;
super.block_size = bdev[0]->get_block_size();
super.block_size = bdev[BDEV_DB]->get_block_size();
super.osd_uuid = osd_uuid;
super.uuid.generate_random();
dout(1) << __func__ << " uuid " << super.uuid << dendl;
@ -175,17 +235,20 @@ int BlueFS::mkfs(uuid_d osd_uuid)
// init log
FileRef log_file = new File;
log_file->fnode.ino = 1;
log_file->fnode.prefer_bdev = bdev.size() - 1;
int r = _allocate(log_file->fnode.prefer_bdev,
g_conf->bluefs_max_log_runway,
&log_file->fnode.extents);
log_file->fnode.prefer_bdev = BDEV_WAL;
int r = _allocate(
log_file->fnode.prefer_bdev,
g_conf->bluefs_max_log_runway,
&log_file->fnode.extents);
assert(r == 0);
log_writer = new FileWriter(log_file, bdev.size());
log_writer = _create_writer(log_file);
// initial txn
log_t.op_init();
for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
interval_set<uint64_t>& p = block_all[bdev];
if (p.empty())
continue;
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
<< "~" << q.get_len() << dendl;
@ -204,7 +267,9 @@ int BlueFS::mkfs(uuid_d osd_uuid)
_close_writer(log_writer);
log_writer = NULL;
block_all.clear();
block_total.clear();
_stop_alloc();
_shutdown_logger();
dout(10) << __func__ << " success" << dendl;
return 0;
@ -213,8 +278,10 @@ int BlueFS::mkfs(uuid_d osd_uuid)
void BlueFS::_init_alloc()
{
dout(20) << __func__ << dendl;
alloc.resize(bdev.size());
alloc.resize(MAX_BDEV);
for (unsigned id = 0; id < bdev.size(); ++id) {
if (!bdev[id])
continue;
alloc[id] = new StupidAllocator;
interval_set<uint64_t>& p = block_all[id];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
@ -235,7 +302,6 @@ void BlueFS::_stop_alloc()
int BlueFS::mount()
{
dout(1) << __func__ << dendl;
assert(!bdev.empty());
int r = _open_super();
if (r < 0) {
@ -244,7 +310,9 @@ int BlueFS::mount()
}
block_all.clear();
block_all.resize(bdev.size());
block_all.resize(MAX_BDEV);
block_total.clear();
block_total.resize(MAX_BDEV, 0);
_init_alloc();
r = _replay();
@ -263,10 +331,12 @@ int BlueFS::mount()
}
// set up the log for future writes
log_writer = new FileWriter(_get_file(1), bdev.size());
log_writer = _create_writer(_get_file(1));
assert(log_writer->file->fnode.ino == 1);
log_writer->pos = log_writer->file->fnode.size;
dout(10) << __func__ << " log write pos set to " << log_writer->pos << dendl;
_init_logger();
return 0;
out:
@ -283,12 +353,12 @@ void BlueFS::umount()
_close_writer(log_writer);
log_writer = NULL;
block_all.clear();
_stop_alloc();
file_map.clear();
dir_map.clear();
super = bluefs_super_t();
log_t.clear();
_shutdown_logger();
}
int BlueFS::fsck()
@ -311,8 +381,8 @@ int BlueFS::_write_super()
bl.rebuild();
IOContext ioc(NULL);
bdev[0]->aio_write(get_super_offset(), bl, &ioc, false);
bdev[0]->aio_submit(&ioc);
bdev[BDEV_DB]->aio_write(get_super_offset(), bl, &ioc, false);
bdev[BDEV_DB]->aio_submit(&ioc);
ioc.aio_wait();
dout(20) << __func__ << " v " << super.version << " crc " << crc
<< " offset " << get_super_offset() << dendl;
@ -328,8 +398,8 @@ int BlueFS::_open_super()
int r;
// always the second block
r = bdev[0]->read(get_super_offset(), get_super_length(),
&bl, ioc[0], false);
r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
&bl, ioc[BDEV_DB], false);
if (r < 0)
return r;
@ -458,6 +528,7 @@ int BlueFS::_replay()
dout(20) << __func__ << " " << pos << ": op_alloc_add "
<< " " << (int)id << ":" << offset << "~" << length << dendl;
block_all[id].insert(offset, length);
block_total[id] += length;
alloc[id]->init_add_free(offset, length);
}
break;
@ -472,6 +543,7 @@ int BlueFS::_replay()
dout(20) << __func__ << " " << pos << ": op_alloc_rm "
<< " " << (int)id << ":" << offset << "~" << length << dendl;
block_all[id].erase(offset, length);
block_total[id] -= length;
alloc[id]->init_rm_free(offset, length);
}
break;
@ -823,7 +895,7 @@ void BlueFS::_compact_log()
t.uuid = super.uuid;
dout(20) << __func__ << " op_init" << dendl;
t.op_init();
for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
interval_set<uint64_t>& p = block_all[bdev];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
@ -868,7 +940,7 @@ void BlueFS::_compact_log()
_close_writer(log_writer);
log_file->fnode.size = bl.length();
log_writer = new FileWriter(log_file, bdev.size());
log_writer = _create_writer(log_file);
log_writer->append(bl);
int r = _flush(log_writer, true);
assert(r == 0);
@ -884,6 +956,8 @@ void BlueFS::_compact_log()
for (auto& r : old_extents) {
alloc[r.bdev]->release(r.offset, r.length);
}
logger->inc(l_bluefs_log_compactions);
}
void BlueFS::_pad_bl(bufferlist& bl)
@ -922,6 +996,8 @@ int BlueFS::_flush_log()
_pad_bl(bl);
log_writer->append(bl);
logger->inc(l_bluefs_logged_bytes, bl.length());
log_t.clear();
log_t.seq = 0; // just so debug output is less confusing
@ -941,6 +1017,8 @@ int BlueFS::_flush_log()
dirty_files.erase(p++);
}
_update_logger_stats();
return 0;
}
@ -1004,7 +1082,9 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
length += partial;
dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
for (auto p : h->iocv) {
p->aio_wait();
if (p) {
p->aio_wait();
}
}
}
if (length == partial + h->buffer.length()) {
@ -1044,8 +1124,9 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
++p;
x_off = 0;
}
for (unsigned i = 0; i < bdev.size(); ++i) {
if (h->iocv[i]->has_aios()) {
for (unsigned i = 0; i < MAX_BDEV; ++i) {
if (bdev[i] && h->iocv[i]->has_aios()) {
assert(h->iocv[i]);
bdev[i]->aio_submit(h->iocv[i]);
}
}
@ -1058,7 +1139,9 @@ void BlueFS::_flush_wait(FileWriter *h)
dout(10) << __func__ << " " << h << dendl;
utime_t start = ceph_clock_now(NULL);
for (auto p : h->iocv) {
p->aio_wait();
if (p) {
p->aio_wait();
}
}
utime_t end = ceph_clock_now(NULL);
utime_t dur = end - start;
@ -1135,7 +1218,8 @@ void BlueFS::_flush_bdev()
{
dout(20) << __func__ << dendl;
for (auto p : bdev) {
p->flush();
if (p)
p->flush();
}
}
@ -1145,16 +1229,24 @@ int BlueFS::_allocate(unsigned id, uint64_t len, vector<bluefs_extent_t> *ev)
assert(id < alloc.size());
uint64_t left = ROUND_UP_TO(len, g_conf->bluefs_alloc_size);
int r = alloc[id]->reserve(left);
int r = -ENOSPC;
if (alloc[id]) {
r = alloc[id]->reserve(left);
}
if (r < 0) {
if (id) {
derr << __func__ << " failed to allocate " << left << " on bdev " << id
<< ", free " << alloc[id]->get_free()
<< "; fallback to bdev 0" << dendl;
return _allocate(0, len, ev);
if (id != BDEV_SLOW) {
if (bdev[id])
derr << __func__ << " failed to allocate " << left << " on bdev " << id
<< ", free " << alloc[id]->get_free()
<< "; fallback to bdev " << id + 1 << dendl;
return _allocate(id + 1, len, ev);
}
derr << __func__ << " failed to allocate " << left << " on bdev " << id
<< ", free " << alloc[id]->get_free() << dendl;
if (bdev[id])
derr << __func__ << " failed to allocate " << left << " on bdev " << id
<< ", free " << alloc[id]->get_free() << dendl;
else
derr << __func__ << " failed to allocate " << left << " on bdev " << id
<< ", dne" << dendl;
return r;
}
@ -1208,11 +1300,15 @@ void BlueFS::sync_metadata()
dout(10) << __func__ << dendl;
utime_t start = ceph_clock_now(NULL);
for (auto p : alloc) {
p->commit_start();
if (p) {
p->commit_start();
}
}
_flush_log();
for (auto p : alloc) {
p->commit_finish();
if (p) {
p->commit_finish();
}
}
_maybe_compact_log();
utime_t end = ceph_clock_now(NULL);
@ -1276,41 +1372,53 @@ int BlueFS::open_for_write(
file->fnode.mtime = ceph_clock_now(NULL);
}
file->fnode.prefer_bdev = BlueFS::BDEV_DB;
if (dirname.length() > 5) {
// the "db.slow" and "db.wal" directory names are hard-coded to
// match up with bluestore. the slow device is always the second
// one (when a dedicated block.db device is present and used at
// bdev 0). the wal device is always last.
if (strcmp(dirname.c_str() + dirname.length() - 5, ".slow") == 0) {
assert(bdev.size() > 1);
dout(20) << __func__ << " mapping " << dirname << "/" << filename
<< " to bdev 1" << dendl;
file->fnode.prefer_bdev = 1;
file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
} else if (strcmp(dirname.c_str() + dirname.length() - 4, ".wal") == 0) {
assert(bdev.size() > 1);
file->fnode.prefer_bdev = bdev.size() - 1;
dout(20) << __func__ << " mapping " << dirname << "/" << filename
<< " to bdev " << (int)file->fnode.prefer_bdev << dendl;
file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
}
}
dout(20) << __func__ << " mapping " << dirname << "/" << filename
<< " to bdev " << (int)file->fnode.prefer_bdev << dendl;
log_t.op_file_update(file->fnode);
if (create)
log_t.op_dir_link(dirname, filename, file->fnode.ino);
*h = new FileWriter(file, bdev.size());
*h = _create_writer(file);
dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
return 0;
}
// Allocate a FileWriter for `f` and fill every slot of its iocv array:
// a fresh IOContext where the matching bdev entry is set, NULL otherwise.
// The returned writer is heap-allocated; this file disposes of writers
// through _close_writer().
BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
{
  FileWriter *writer = new FileWriter(f);
  for (unsigned id = 0; id < MAX_BDEV; ++id)
    writer->iocv[id] = bdev[id] ? new IOContext(NULL) : NULL;
  return writer;
}
void BlueFS::_close_writer(FileWriter *h)
{
dout(10) << __func__ << " " << h << dendl;
for (unsigned i=0; i<bdev.size(); ++i) {
h->iocv[i]->aio_wait();
bdev[i]->queue_reap_ioc(h->iocv[i]);
for (unsigned i=0; i<MAX_BDEV; ++i) {
if (bdev[i]) {
assert(h->iocv[i]);
h->iocv[i]->aio_wait();
bdev[i]->queue_reap_ioc(h->iocv[i]);
}
}
h->iocv.clear();
delete h;
}

View File

@ -13,10 +13,34 @@
#include "boost/intrusive/list.hpp"
#include <boost/intrusive_ptr.hpp>
class PerfCounters;
class Allocator;
// Perf counter indices for BlueFS. l_bluefs_first and l_bluefs_last are the
// bounds handed to PerfCountersBuilder in BlueFS::_init_logger(); the entries
// between them are registered there one by one.
enum {
l_bluefs_first = 732600,   ///< lower bound of the index range (no counter is registered for it)
l_bluefs_gift_bytes,       ///< bytes gifted from BlueStore; bumped in add_block_extent()
l_bluefs_reclaim_bytes,    ///< bytes reclaimed by BlueStore; bumped in reclaim_blocks()
l_bluefs_db_total_bytes,   ///< total bytes (main db device)
l_bluefs_db_free_bytes,    ///< free bytes (main db device)
l_bluefs_wal_total_bytes,  ///< total bytes (wal device)
l_bluefs_wal_free_bytes,   ///< free bytes (wal device)
l_bluefs_slow_total_bytes, ///< total bytes (slow device)
l_bluefs_slow_free_bytes,  ///< free bytes (slow device)
l_bluefs_num_files,        ///< file count (size of file_map)
l_bluefs_log_bytes,        ///< size of the metadata log
l_bluefs_log_compactions,  ///< compactions of the metadata log; bumped in _compact_log()
l_bluefs_logged_bytes,     ///< bytes written to the metadata log; bumped in _flush_log()
l_bluefs_last,             ///< upper bound of the index range (no counter is registered for it)
};
class BlueFS {
public:
static constexpr unsigned MAX_BDEV = 3;
static constexpr unsigned BDEV_WAL = 0;
static constexpr unsigned BDEV_DB = 1;
static constexpr unsigned BDEV_SLOW = 2;
struct File : public RefCountedObject {
bluefs_fnode_t fnode;
int refs;
@ -80,22 +104,17 @@ public:
bufferlist tail_block; ///< existing partial block at end of file, if any
std::mutex lock;
vector<IOContext*> iocv; ///< one for each bdev
std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
FileWriter(FileRef f, unsigned num_bdev)
FileWriter(FileRef f)
: file(f),
pos(0) {
++file->num_writers;
iocv.resize(num_bdev);
for (unsigned i = 0; i < num_bdev; ++i) {
iocv[i] = new IOContext(NULL);
}
}
// NOTE: caller must call BlueFS::close_writer()
~FileWriter() {
--file->num_writers;
assert(iocv.empty()); // caller must call BlueFS::close_writer()
}
void append(const char *buf, size_t len) {
buffer.append(buf, len);
}
@ -161,6 +180,8 @@ public:
private:
std::mutex lock;
PerfCounters *logger;
// cache
map<string, DirRef> dir_map; ///< dirname -> Dir
ceph::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
@ -173,25 +194,22 @@ private:
bluefs_transaction_t log_t; ///< pending, unwritten log transaction
/*
* - there can be from 1 to 3 block devices.
* There are up to 3 block devices:
*
* - the first device always has the superblock.
*
* - if there is a dedicated db device, it is the first device, and the
* second device is shared with bluestore. the first device will be
* db/, and the second device will be db.slow/.
*
* - if there is no dedicated db device, then the first device is shared, and
* maps to the db/ directory.
*
* - a wal device, if present, it always the last device. it should be
* used for any files in the db.wal/ directory.
* BDEV_DB db/ - the primary db device
* BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
* BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
*/
vector<BlockDevice*> bdev; ///< block devices we can use
vector<IOContext*> ioc; ///< IOContexts for bdevs
vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
vector<uint64_t> block_total; ///< sum of block_all
vector<Allocator*> alloc; ///< allocators for bdevs
void _init_logger();
void _shutdown_logger();
void _update_logger_stats();
void _init_alloc();
void _stop_alloc();
@ -237,6 +255,7 @@ private:
int _write_super();
int _replay(); ///< replay journal
FileWriter *_create_writer(FileRef f);
void _close_writer(FileWriter *h);
// always put the super in the second 4k block. FIXME should this be

View File

@ -1145,7 +1145,8 @@ int BlueStore::_open_db(bool create)
} else if (s == "0") {
do_bluefs = false;
} else {
derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" << dendl;
derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
<< dendl;
return -EIO;
}
}
@ -1162,36 +1163,42 @@ int BlueStore::_open_db(bool create)
char bfn[PATH_MAX];
struct stat st;
int id = 0;
snprintf(bfn, sizeof(bfn), "%s/block.db", path.c_str());
if (::stat(bfn, &st) == 0) {
r = bluefs->add_block_device(id, bfn);
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
r = _check_or_set_bdev_label(
bfn,
bluefs->get_block_device_size(BlueFS::BDEV_DB),
"bluefs db", create);
if (r < 0) {
derr << __func__ << " check block device(" << bfn << ") label returned: "
derr << __func__
<< " check block device(" << bfn << ") label returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
if (create) {
bluefs->add_block_extent(
id, BLUEFS_START,
bluefs->get_block_device_size(id) - BLUEFS_START);
BlueFS::BDEV_DB,
BLUEFS_START,
bluefs->get_block_device_size(BlueFS::BDEV_DB) - BLUEFS_START);
}
++id;
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
} else {
bluefs_shared_bdev = BlueFS::BDEV_DB;
}
// shared device
snprintf(bfn, sizeof(bfn), "%s/block", path.c_str());
r = bluefs->add_block_device(id, bfn);
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
if (create) {
@ -1204,21 +1211,23 @@ int BlueStore::_open_db(bool create)
// align to bluefs's alloc_size
initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
initial += g_conf->bluefs_alloc_size - BLUEFS_START;
bluefs->add_block_extent(id, BLUEFS_START, initial);
bluefs->add_block_extent(bluefs_shared_bdev, BLUEFS_START, initial);
bluefs_extents.insert(BLUEFS_START, initial);
}
bluefs_shared_bdev = id;
++id;
if (id == 2) {
// use a short, relative path, if it's bluefs.
strcpy(fn, "db");
if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
// we have both block.db and block; tell rocksdb!
// note: the second (last) size value doesn't really matter
char db_paths[PATH_MAX*3];
snprintf(
db_paths, sizeof(db_paths), "%s/db,%lld %s/db.slow,%lld",
path.c_str(),
(unsigned long long)bluefs->get_block_device_size(0) * 95 / 100,
path.c_str(),
(unsigned long long)bluefs->get_block_device_size(1) * 95 / 100);
db_paths, sizeof(db_paths), "db,%lld db.slow,%lld",
(unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_DB) *
95 / 100,
(unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_SLOW) *
95 / 100);
g_conf->set_val("rocksdb_db_paths", db_paths, false, false);
dout(10) << __func__ << " set rocksdb_db_paths to "
<< g_conf->rocksdb_db_paths << dendl;
@ -1226,23 +1235,26 @@ int BlueStore::_open_db(bool create)
snprintf(bfn, sizeof(bfn), "%s/block.wal", path.c_str());
if (::stat(bfn, &st) == 0) {
r = bluefs->add_block_device(id, bfn);
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
r = _check_or_set_bdev_label(
bfn,
bluefs->get_block_device_size(BlueFS::BDEV_WAL),
"bluefs wal", create);
if (r < 0) {
derr << __func__ << " check block device(" << bfn << ") label returned: "
derr << __func__ << " check block device(" << bfn << ") label returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
if (create) {
bluefs->add_block_extent(
id, BDEV_LABEL_BLOCK_SIZE,
bluefs->get_block_device_size(id) - BDEV_LABEL_BLOCK_SIZE);
BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
BDEV_LABEL_BLOCK_SIZE);
}
g_conf->set_val("rocksdb_separate_wal_dir", "true");
} else {
@ -1320,7 +1332,8 @@ int BlueStore::_open_db(bool create)
delete bluefs;
bluefs = NULL;
}
// delete env manually here since we can't depend on db to do this under this case
// delete env manually here since we can't depend on db to do this
// under this case
delete env;
env = NULL;
return -EIO;

View File

@ -38,8 +38,8 @@ TEST(BlueFS, mkfs) {
string fn = get_temp_bdev(size);
uuid_d fsid;
BlueFS fs;
fs.add_block_device(0, fn);
fs.add_block_extent(0, 1048576, size - 1048576);
fs.add_block_device(BlueFS::BDEV_DB, fn);
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
fs.mkfs(fsid);
rm_temp_bdev(fn);
}
@ -48,13 +48,13 @@ TEST(BlueFS, mkfs_mount) {
uint64_t size = 1048476 * 128;
string fn = get_temp_bdev(size);
BlueFS fs;
ASSERT_EQ(0, fs.add_block_device(0, fn));
fs.add_block_extent(0, 1048576, size - 1048576);
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid));
ASSERT_EQ(0, fs.mount());
ASSERT_EQ(fs.get_total(0), size - 1048576);
ASSERT_LT(fs.get_free(0), size - 1048576);
ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576);
ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576);
fs.umount();
rm_temp_bdev(fn);
}
@ -63,8 +63,8 @@ TEST(BlueFS, write_read) {
uint64_t size = 1048476 * 128;
string fn = get_temp_bdev(size);
BlueFS fs;
ASSERT_EQ(0, fs.add_block_device(0, fn));
fs.add_block_extent(0, 1048576, size - 1048576);
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid));
ASSERT_EQ(0, fs.mount());
@ -99,8 +99,8 @@ TEST(BlueFS, small_appends) {
uint64_t size = 1048476 * 128;
string fn = get_temp_bdev(size);
BlueFS fs;
ASSERT_EQ(0, fs.add_block_device(0, fn));
fs.add_block_extent(0, 1048576, size - 1048576);
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid));
ASSERT_EQ(0, fs.mount());