mirror of
https://github.com/ceph/ceph
synced 2025-01-11 13:41:02 +00:00
Merge pull request #8431 from liewegas/wip-bluestore
os/bluestore: revamp BlueFS bdev management and add perfcounters
This commit is contained in:
commit
78d4fada4b
@ -5,6 +5,7 @@
|
||||
|
||||
#include "common/debug.h"
|
||||
#include "common/errno.h"
|
||||
#include "common/perf_counters.h"
|
||||
#include "BlockDevice.h"
|
||||
#include "Allocator.h"
|
||||
#include "StupidAllocator.h"
|
||||
@ -15,23 +16,76 @@
|
||||
#define dout_prefix *_dout << "bluefs "
|
||||
|
||||
/// Construct an empty, unmounted BlueFS.
///
/// The per-device containers are pre-sized to MAX_BDEV so that a device can
/// be registered under its fixed slot id. Pointer slots start out null and
/// block_total starts at 0 for every slot. No perf counters exist yet, so
/// logger starts as NULL.
BlueFS::BlueFS()
  : logger(NULL),
    ino_last(0),
    log_seq(0),
    log_writer(NULL),
    bdev(MAX_BDEV),
    ioc(MAX_BDEV),
    block_all(MAX_BDEV),
    block_total(MAX_BDEV, 0)
{
}
|
||||
|
||||
/// Close and free every opened block device, then free the per-device
/// IOContexts.
BlueFS::~BlueFS()
{
  // Device slots are sparse (sized to MAX_BDEV up front), so an unused slot
  // is null and must be skipped rather than dereferenced.
  for (auto p : bdev) {
    if (p) {
      p->close();
      delete p;
    }
  }
  // delete on a null pointer is a no-op, so unused ioc slots need no guard.
  for (auto p : ioc) {
    delete p;
  }
}
|
||||
|
||||
void BlueFS::_init_logger()
|
||||
{
|
||||
PerfCountersBuilder b(g_ceph_context, "BlueFS",
|
||||
l_bluefs_first, l_bluefs_last);
|
||||
b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", "Bytes gifted from BlueStore");
|
||||
b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", "Bytes reclaimed by BlueStore");
|
||||
b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", "Total bytes (main db device)");
|
||||
b.add_u64(l_bluefs_db_free_bytes, "db_free_bytes", "Free bytes (main db device)");
|
||||
b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", "Total bytes (wal device)");
|
||||
b.add_u64(l_bluefs_wal_free_bytes, "wal_free_bytes", "Free bytes (wal device)");
|
||||
b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", "Total bytes (slow device)");
|
||||
b.add_u64(l_bluefs_slow_free_bytes, "slow_free_bytes", "Free bytes (slow device)");
|
||||
b.add_u64(l_bluefs_num_files, "num_files", "File count");
|
||||
b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log");
|
||||
b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", "Compactions of the metadata log");
|
||||
b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", "Bytes written to the metadata log");
|
||||
logger = b.create_perf_counters();
|
||||
g_ceph_context->get_perfcounters_collection()->add(logger);
|
||||
}
|
||||
|
||||
void BlueFS::_shutdown_logger()
|
||||
{
|
||||
g_ceph_context->get_perfcounters_collection()->remove(logger);
|
||||
delete logger;
|
||||
}
|
||||
|
||||
void BlueFS::_update_logger_stats()
|
||||
{
|
||||
// we must be holding the lock
|
||||
logger->set(l_bluefs_num_files, file_map.size());
|
||||
logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
|
||||
|
||||
if (alloc[BDEV_WAL]) {
|
||||
logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
|
||||
logger->set(l_bluefs_wal_free_bytes, alloc[BDEV_WAL]->get_free());
|
||||
}
|
||||
if (alloc[BDEV_DB]) {
|
||||
logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
|
||||
logger->set(l_bluefs_db_free_bytes, alloc[BDEV_DB]->get_free());
|
||||
}
|
||||
if (alloc[BDEV_SLOW]) {
|
||||
logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
|
||||
logger->set(l_bluefs_slow_free_bytes, alloc[BDEV_SLOW]->get_free());
|
||||
}
|
||||
}
|
||||
|
||||
/*static void aio_cb(void *priv, void *priv2)
|
||||
{
|
||||
BlueFS *fs = static_cast<BlueFS*>(priv);
|
||||
@ -42,7 +96,8 @@ BlueFS::~BlueFS()
|
||||
int BlueFS::add_block_device(unsigned id, string path)
|
||||
{
|
||||
dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
|
||||
assert(id == bdev.size());
|
||||
assert(id < bdev.size());
|
||||
assert(bdev[id] == NULL);
|
||||
BlockDevice *b = BlockDevice::create(path, NULL, NULL); //aio_cb, this);
|
||||
int r = b->open(path);
|
||||
if (r < 0) {
|
||||
@ -51,15 +106,16 @@ int BlueFS::add_block_device(unsigned id, string path)
|
||||
}
|
||||
dout(1) << __func__ << " bdev " << id << " path " << path
|
||||
<< " size " << pretty_si_t(b->get_size()) << "B" << dendl;
|
||||
bdev.push_back(b);
|
||||
ioc.push_back(new IOContext(NULL));
|
||||
block_all.resize(bdev.size());
|
||||
bdev[id] = b;
|
||||
ioc[id] = new IOContext(NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t BlueFS::get_block_device_size(unsigned id)
|
||||
{
|
||||
return bdev[id]->get_size();
|
||||
if (bdev[id])
|
||||
return bdev[id]->get_size();
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
|
||||
@ -68,8 +124,10 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
|
||||
dout(1) << __func__ << " bdev " << id << " " << offset << "~" << length
|
||||
<< dendl;
|
||||
assert(id < bdev.size());
|
||||
assert(bdev[id]);
|
||||
assert(bdev[id]->get_size() >= offset + length);
|
||||
block_all[id].insert(offset, length);
|
||||
block_total[id] += length;
|
||||
|
||||
if (alloc.size()) {
|
||||
log_t.op_alloc_add(id, offset, length);
|
||||
@ -77,6 +135,9 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
|
||||
assert(r == 0);
|
||||
alloc[id]->init_add_free(offset, length);
|
||||
}
|
||||
|
||||
if (logger)
|
||||
logger->inc(l_bluefs_gift_bytes, length);
|
||||
dout(10) << __func__ << " done" << dendl;
|
||||
}
|
||||
|
||||
@ -86,6 +147,7 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
|
||||
std::lock_guard<std::mutex> l(lock);
|
||||
dout(1) << __func__ << " bdev " << id << " want " << want << dendl;
|
||||
assert(id < alloc.size());
|
||||
assert(alloc[id]);
|
||||
int r = alloc[id]->reserve(want);
|
||||
assert(r == 0); // caller shouldn't ask for more than they can get
|
||||
|
||||
@ -96,10 +158,13 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
|
||||
alloc[id]->unreserve(want - *length);
|
||||
|
||||
block_all[id].erase(*offset, *length);
|
||||
block_total[id] -= *length;
|
||||
log_t.op_alloc_rm(id, *offset, *length);
|
||||
r = _flush_log();
|
||||
assert(r == 0);
|
||||
|
||||
if (logger)
|
||||
logger->inc(l_bluefs_reclaim_bytes, *length);
|
||||
dout(1) << __func__ << " bdev " << id << " want " << want
|
||||
<< " got " << *offset << "~" << *length << dendl;
|
||||
return 0;
|
||||
@ -109,12 +174,7 @@ uint64_t BlueFS::get_total(unsigned id)
|
||||
{
|
||||
std::lock_guard<std::mutex> l(lock);
|
||||
assert(id < block_all.size());
|
||||
uint64_t r = 0;
|
||||
interval_set<uint64_t>& p = block_all[id];
|
||||
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
|
||||
r += q.get_len();
|
||||
}
|
||||
return r;
|
||||
return block_total[id];
|
||||
}
|
||||
|
||||
uint64_t BlueFS::get_free(unsigned id)
|
||||
@ -129,14 +189,14 @@ void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
|
||||
std::lock_guard<std::mutex> l(lock);
|
||||
usage->resize(bdev.size());
|
||||
for (unsigned id = 0; id < bdev.size(); ++id) {
|
||||
uint64_t total = 0;
|
||||
interval_set<uint64_t>& p = block_all[id];
|
||||
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
|
||||
total += q.get_len();
|
||||
if (!bdev[id]) {
|
||||
(*usage)[id] = make_pair(0, 0);
|
||||
continue;
|
||||
}
|
||||
(*usage)[id].first = alloc[id]->get_free();
|
||||
(*usage)[id].second = total;
|
||||
uint64_t used = (total - (*usage)[id].first) * 100 / total;
|
||||
(*usage)[id].second = block_total[id];
|
||||
uint64_t used =
|
||||
(block_total[id] - (*usage)[id].first) * 100 / block_total[id];
|
||||
dout(10) << __func__ << " bdev " << id
|
||||
<< " free " << (*usage)[id].first
|
||||
<< " (" << pretty_si_t((*usage)[id].first) << "B)"
|
||||
@ -162,12 +222,12 @@ int BlueFS::mkfs(uuid_d osd_uuid)
|
||||
dout(1) << __func__
|
||||
<< " osd_uuid " << osd_uuid
|
||||
<< dendl;
|
||||
assert(bdev.size() >= 1);
|
||||
|
||||
_init_alloc();
|
||||
_init_logger();
|
||||
|
||||
super.version = 1;
|
||||
super.block_size = bdev[0]->get_block_size();
|
||||
super.block_size = bdev[BDEV_DB]->get_block_size();
|
||||
super.osd_uuid = osd_uuid;
|
||||
super.uuid.generate_random();
|
||||
dout(1) << __func__ << " uuid " << super.uuid << dendl;
|
||||
@ -175,17 +235,20 @@ int BlueFS::mkfs(uuid_d osd_uuid)
|
||||
// init log
|
||||
FileRef log_file = new File;
|
||||
log_file->fnode.ino = 1;
|
||||
log_file->fnode.prefer_bdev = bdev.size() - 1;
|
||||
int r = _allocate(log_file->fnode.prefer_bdev,
|
||||
g_conf->bluefs_max_log_runway,
|
||||
&log_file->fnode.extents);
|
||||
log_file->fnode.prefer_bdev = BDEV_WAL;
|
||||
int r = _allocate(
|
||||
log_file->fnode.prefer_bdev,
|
||||
g_conf->bluefs_max_log_runway,
|
||||
&log_file->fnode.extents);
|
||||
assert(r == 0);
|
||||
log_writer = new FileWriter(log_file, bdev.size());
|
||||
log_writer = _create_writer(log_file);
|
||||
|
||||
// initial txn
|
||||
log_t.op_init();
|
||||
for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
|
||||
for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
|
||||
interval_set<uint64_t>& p = block_all[bdev];
|
||||
if (p.empty())
|
||||
continue;
|
||||
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
|
||||
dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
|
||||
<< "~" << q.get_len() << dendl;
|
||||
@ -204,7 +267,9 @@ int BlueFS::mkfs(uuid_d osd_uuid)
|
||||
_close_writer(log_writer);
|
||||
log_writer = NULL;
|
||||
block_all.clear();
|
||||
block_total.clear();
|
||||
_stop_alloc();
|
||||
_shutdown_logger();
|
||||
|
||||
dout(10) << __func__ << " success" << dendl;
|
||||
return 0;
|
||||
@ -213,8 +278,10 @@ int BlueFS::mkfs(uuid_d osd_uuid)
|
||||
void BlueFS::_init_alloc()
|
||||
{
|
||||
dout(20) << __func__ << dendl;
|
||||
alloc.resize(bdev.size());
|
||||
alloc.resize(MAX_BDEV);
|
||||
for (unsigned id = 0; id < bdev.size(); ++id) {
|
||||
if (!bdev[id])
|
||||
continue;
|
||||
alloc[id] = new StupidAllocator;
|
||||
interval_set<uint64_t>& p = block_all[id];
|
||||
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
|
||||
@ -235,7 +302,6 @@ void BlueFS::_stop_alloc()
|
||||
int BlueFS::mount()
|
||||
{
|
||||
dout(1) << __func__ << dendl;
|
||||
assert(!bdev.empty());
|
||||
|
||||
int r = _open_super();
|
||||
if (r < 0) {
|
||||
@ -244,7 +310,9 @@ int BlueFS::mount()
|
||||
}
|
||||
|
||||
block_all.clear();
|
||||
block_all.resize(bdev.size());
|
||||
block_all.resize(MAX_BDEV);
|
||||
block_total.clear();
|
||||
block_total.resize(MAX_BDEV, 0);
|
||||
_init_alloc();
|
||||
|
||||
r = _replay();
|
||||
@ -263,10 +331,12 @@ int BlueFS::mount()
|
||||
}
|
||||
|
||||
// set up the log for future writes
|
||||
log_writer = new FileWriter(_get_file(1), bdev.size());
|
||||
log_writer = _create_writer(_get_file(1));
|
||||
assert(log_writer->file->fnode.ino == 1);
|
||||
log_writer->pos = log_writer->file->fnode.size;
|
||||
dout(10) << __func__ << " log write pos set to " << log_writer->pos << dendl;
|
||||
|
||||
_init_logger();
|
||||
return 0;
|
||||
|
||||
out:
|
||||
@ -283,12 +353,12 @@ void BlueFS::umount()
|
||||
_close_writer(log_writer);
|
||||
log_writer = NULL;
|
||||
|
||||
block_all.clear();
|
||||
_stop_alloc();
|
||||
file_map.clear();
|
||||
dir_map.clear();
|
||||
super = bluefs_super_t();
|
||||
log_t.clear();
|
||||
_shutdown_logger();
|
||||
}
|
||||
|
||||
int BlueFS::fsck()
|
||||
@ -311,8 +381,8 @@ int BlueFS::_write_super()
|
||||
bl.rebuild();
|
||||
|
||||
IOContext ioc(NULL);
|
||||
bdev[0]->aio_write(get_super_offset(), bl, &ioc, false);
|
||||
bdev[0]->aio_submit(&ioc);
|
||||
bdev[BDEV_DB]->aio_write(get_super_offset(), bl, &ioc, false);
|
||||
bdev[BDEV_DB]->aio_submit(&ioc);
|
||||
ioc.aio_wait();
|
||||
dout(20) << __func__ << " v " << super.version << " crc " << crc
|
||||
<< " offset " << get_super_offset() << dendl;
|
||||
@ -328,8 +398,8 @@ int BlueFS::_open_super()
|
||||
int r;
|
||||
|
||||
// always the second block
|
||||
r = bdev[0]->read(get_super_offset(), get_super_length(),
|
||||
&bl, ioc[0], false);
|
||||
r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
|
||||
&bl, ioc[BDEV_DB], false);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -458,6 +528,7 @@ int BlueFS::_replay()
|
||||
dout(20) << __func__ << " " << pos << ": op_alloc_add "
|
||||
<< " " << (int)id << ":" << offset << "~" << length << dendl;
|
||||
block_all[id].insert(offset, length);
|
||||
block_total[id] += length;
|
||||
alloc[id]->init_add_free(offset, length);
|
||||
}
|
||||
break;
|
||||
@ -472,6 +543,7 @@ int BlueFS::_replay()
|
||||
dout(20) << __func__ << " " << pos << ": op_alloc_rm "
|
||||
<< " " << (int)id << ":" << offset << "~" << length << dendl;
|
||||
block_all[id].erase(offset, length);
|
||||
block_total[id] -= length;
|
||||
alloc[id]->init_rm_free(offset, length);
|
||||
}
|
||||
break;
|
||||
@ -823,7 +895,7 @@ void BlueFS::_compact_log()
|
||||
t.uuid = super.uuid;
|
||||
dout(20) << __func__ << " op_init" << dendl;
|
||||
t.op_init();
|
||||
for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
|
||||
for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
|
||||
interval_set<uint64_t>& p = block_all[bdev];
|
||||
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
|
||||
dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
|
||||
@ -868,7 +940,7 @@ void BlueFS::_compact_log()
|
||||
_close_writer(log_writer);
|
||||
|
||||
log_file->fnode.size = bl.length();
|
||||
log_writer = new FileWriter(log_file, bdev.size());
|
||||
log_writer = _create_writer(log_file);
|
||||
log_writer->append(bl);
|
||||
int r = _flush(log_writer, true);
|
||||
assert(r == 0);
|
||||
@ -884,6 +956,8 @@ void BlueFS::_compact_log()
|
||||
for (auto& r : old_extents) {
|
||||
alloc[r.bdev]->release(r.offset, r.length);
|
||||
}
|
||||
|
||||
logger->inc(l_bluefs_log_compactions);
|
||||
}
|
||||
|
||||
void BlueFS::_pad_bl(bufferlist& bl)
|
||||
@ -922,6 +996,8 @@ int BlueFS::_flush_log()
|
||||
_pad_bl(bl);
|
||||
log_writer->append(bl);
|
||||
|
||||
logger->inc(l_bluefs_logged_bytes, bl.length());
|
||||
|
||||
log_t.clear();
|
||||
log_t.seq = 0; // just so debug output is less confusing
|
||||
|
||||
@ -941,6 +1017,8 @@ int BlueFS::_flush_log()
|
||||
dirty_files.erase(p++);
|
||||
}
|
||||
|
||||
_update_logger_stats();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1004,7 +1082,9 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
|
||||
length += partial;
|
||||
dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
|
||||
for (auto p : h->iocv) {
|
||||
p->aio_wait();
|
||||
if (p) {
|
||||
p->aio_wait();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (length == partial + h->buffer.length()) {
|
||||
@ -1044,8 +1124,9 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
|
||||
++p;
|
||||
x_off = 0;
|
||||
}
|
||||
for (unsigned i = 0; i < bdev.size(); ++i) {
|
||||
if (h->iocv[i]->has_aios()) {
|
||||
for (unsigned i = 0; i < MAX_BDEV; ++i) {
|
||||
if (bdev[i] && h->iocv[i]->has_aios()) {
|
||||
assert(h->iocv[i]);
|
||||
bdev[i]->aio_submit(h->iocv[i]);
|
||||
}
|
||||
}
|
||||
@ -1058,7 +1139,9 @@ void BlueFS::_flush_wait(FileWriter *h)
|
||||
dout(10) << __func__ << " " << h << dendl;
|
||||
utime_t start = ceph_clock_now(NULL);
|
||||
for (auto p : h->iocv) {
|
||||
p->aio_wait();
|
||||
if (p) {
|
||||
p->aio_wait();
|
||||
}
|
||||
}
|
||||
utime_t end = ceph_clock_now(NULL);
|
||||
utime_t dur = end - start;
|
||||
@ -1135,7 +1218,8 @@ void BlueFS::_flush_bdev()
|
||||
{
|
||||
dout(20) << __func__ << dendl;
|
||||
for (auto p : bdev) {
|
||||
p->flush();
|
||||
if (p)
|
||||
p->flush();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1145,16 +1229,24 @@ int BlueFS::_allocate(unsigned id, uint64_t len, vector<bluefs_extent_t> *ev)
|
||||
assert(id < alloc.size());
|
||||
|
||||
uint64_t left = ROUND_UP_TO(len, g_conf->bluefs_alloc_size);
|
||||
int r = alloc[id]->reserve(left);
|
||||
int r = -ENOSPC;
|
||||
if (alloc[id]) {
|
||||
r = alloc[id]->reserve(left);
|
||||
}
|
||||
if (r < 0) {
|
||||
if (id) {
|
||||
derr << __func__ << " failed to allocate " << left << " on bdev " << id
|
||||
<< ", free " << alloc[id]->get_free()
|
||||
<< "; fallback to bdev 0" << dendl;
|
||||
return _allocate(0, len, ev);
|
||||
if (id != BDEV_SLOW) {
|
||||
if (bdev[id])
|
||||
derr << __func__ << " failed to allocate " << left << " on bdev " << id
|
||||
<< ", free " << alloc[id]->get_free()
|
||||
<< "; fallback to bdev " << id + 1 << dendl;
|
||||
return _allocate(id + 1, len, ev);
|
||||
}
|
||||
derr << __func__ << " failed to allocate " << left << " on bdev " << id
|
||||
<< ", free " << alloc[id]->get_free() << dendl;
|
||||
if (bdev[id])
|
||||
derr << __func__ << " failed to allocate " << left << " on bdev " << id
|
||||
<< ", free " << alloc[id]->get_free() << dendl;
|
||||
else
|
||||
derr << __func__ << " failed to allocate " << left << " on bdev " << id
|
||||
<< ", dne" << dendl;
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -1208,11 +1300,15 @@ void BlueFS::sync_metadata()
|
||||
dout(10) << __func__ << dendl;
|
||||
utime_t start = ceph_clock_now(NULL);
|
||||
for (auto p : alloc) {
|
||||
p->commit_start();
|
||||
if (p) {
|
||||
p->commit_start();
|
||||
}
|
||||
}
|
||||
_flush_log();
|
||||
for (auto p : alloc) {
|
||||
p->commit_finish();
|
||||
if (p) {
|
||||
p->commit_finish();
|
||||
}
|
||||
}
|
||||
_maybe_compact_log();
|
||||
utime_t end = ceph_clock_now(NULL);
|
||||
@ -1276,41 +1372,53 @@ int BlueFS::open_for_write(
|
||||
file->fnode.mtime = ceph_clock_now(NULL);
|
||||
}
|
||||
|
||||
file->fnode.prefer_bdev = BlueFS::BDEV_DB;
|
||||
if (dirname.length() > 5) {
|
||||
// the "db.slow" and "db.wal" directory names are hard-coded to
|
||||
// match up with bluestore. the slow device is always the second
|
||||
// one (when a dedicated block.db device is present and used at
|
||||
// bdev 0). the wal device is always last.
|
||||
if (strcmp(dirname.c_str() + dirname.length() - 5, ".slow") == 0) {
|
||||
assert(bdev.size() > 1);
|
||||
dout(20) << __func__ << " mapping " << dirname << "/" << filename
|
||||
<< " to bdev 1" << dendl;
|
||||
file->fnode.prefer_bdev = 1;
|
||||
file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
|
||||
} else if (strcmp(dirname.c_str() + dirname.length() - 4, ".wal") == 0) {
|
||||
assert(bdev.size() > 1);
|
||||
file->fnode.prefer_bdev = bdev.size() - 1;
|
||||
dout(20) << __func__ << " mapping " << dirname << "/" << filename
|
||||
<< " to bdev " << (int)file->fnode.prefer_bdev << dendl;
|
||||
file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
|
||||
}
|
||||
}
|
||||
dout(20) << __func__ << " mapping " << dirname << "/" << filename
|
||||
<< " to bdev " << (int)file->fnode.prefer_bdev << dendl;
|
||||
|
||||
log_t.op_file_update(file->fnode);
|
||||
if (create)
|
||||
log_t.op_dir_link(dirname, filename, file->fnode.ino);
|
||||
|
||||
*h = new FileWriter(file, bdev.size());
|
||||
*h = _create_writer(file);
|
||||
dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Allocate a FileWriter for `f` and give it one IOContext per device slot
/// that currently holds a device; the remaining slots are set to NULL.
///
/// The returned writer is heap-allocated. In this file it is released via
/// _close_writer().
BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
{
  FileWriter *writer = new FileWriter(f);
  for (unsigned slot = 0; slot < MAX_BDEV; ++slot) {
    writer->iocv[slot] =
      bdev[slot] ? new IOContext(NULL) : static_cast<IOContext*>(NULL);
  }
  return writer;
}
|
||||
|
||||
/// Wait for all of a writer's outstanding aio, hand each of its IOContexts
/// to the owning device, and free the writer.
///
/// @param h  writer to close; it is deleted here and must not be used
///           afterwards.
void BlueFS::_close_writer(FileWriter *h)
{
  dout(10) << __func__ << " " << h << dendl;
  for (unsigned i = 0; i < MAX_BDEV; ++i) {
    // Only slots with a device have an IOContext.
    if (!bdev[i])
      continue;
    assert(h->iocv[i]);
    h->iocv[i]->aio_wait();
    // NOTE(review): presumably the device frees the context later - confirm
    // against BlockDevice::queue_reap_ioc.
    bdev[i]->queue_reap_ioc(h->iocv[i]);
  }
  delete h;
}
|
||||
|
||||
|
@ -13,10 +13,34 @@
|
||||
#include "boost/intrusive/list.hpp"
|
||||
#include <boost/intrusive_ptr.hpp>
|
||||
|
||||
class PerfCounters;
|
||||
|
||||
class Allocator;
|
||||
|
||||
// Perf counter indices for the "BlueFS" counter set. The range is
// (l_bluefs_first, l_bluefs_last); values are consecutive, so keep the
// order stable.
enum {
  l_bluefs_first = 732600,

  // space exchanged with BlueStore
  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,

  // per-device capacity
  l_bluefs_db_total_bytes,
  l_bluefs_db_free_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_free_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_free_bytes,

  // files and metadata log
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,

  l_bluefs_last,
};
|
||||
|
||||
class BlueFS {
|
||||
public:
|
||||
static constexpr unsigned MAX_BDEV = 3;
|
||||
static constexpr unsigned BDEV_WAL = 0;
|
||||
static constexpr unsigned BDEV_DB = 1;
|
||||
static constexpr unsigned BDEV_SLOW = 2;
|
||||
|
||||
struct File : public RefCountedObject {
|
||||
bluefs_fnode_t fnode;
|
||||
int refs;
|
||||
@ -80,22 +104,17 @@ public:
|
||||
bufferlist tail_block; ///< existing partial block at end of file, if any
|
||||
|
||||
std::mutex lock;
|
||||
vector<IOContext*> iocv; ///< one for each bdev
|
||||
std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
|
||||
|
||||
FileWriter(FileRef f, unsigned num_bdev)
|
||||
FileWriter(FileRef f)
|
||||
: file(f),
|
||||
pos(0) {
|
||||
++file->num_writers;
|
||||
iocv.resize(num_bdev);
|
||||
for (unsigned i = 0; i < num_bdev; ++i) {
|
||||
iocv[i] = new IOContext(NULL);
|
||||
}
|
||||
}
|
||||
// NOTE: caller must call BlueFS::close_writer()
|
||||
~FileWriter() {
|
||||
--file->num_writers;
|
||||
assert(iocv.empty()); // caller must call BlueFS::close_writer()
|
||||
}
|
||||
|
||||
void append(const char *buf, size_t len) {
|
||||
buffer.append(buf, len);
|
||||
}
|
||||
@ -161,6 +180,8 @@ public:
|
||||
private:
|
||||
std::mutex lock;
|
||||
|
||||
PerfCounters *logger;
|
||||
|
||||
// cache
|
||||
map<string, DirRef> dir_map; ///< dirname -> Dir
|
||||
ceph::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
|
||||
@ -173,25 +194,22 @@ private:
|
||||
bluefs_transaction_t log_t; ///< pending, unwritten log transaction
|
||||
|
||||
/*
|
||||
* - there can be from 1 to 3 block devices.
|
||||
* There are up to 3 block devices:
|
||||
*
|
||||
* - the first device always has the superblock.
|
||||
*
|
||||
* - if there is a dedicated db device, it is the first device, and the
|
||||
* second device is shared with bluestore. the first device will be
|
||||
* db/, and the second device will be db.slow/.
|
||||
*
|
||||
* - if there is no dedicated db device, then the first device is shared, and
|
||||
* maps to the db/ directory.
|
||||
*
|
||||
* - a wal device, if present, it always the last device. it should be
|
||||
* used for any files in the db.wal/ directory.
|
||||
* BDEV_DB db/ - the primary db device
|
||||
* BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
|
||||
* BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
|
||||
*/
|
||||
vector<BlockDevice*> bdev; ///< block devices we can use
|
||||
vector<IOContext*> ioc; ///< IOContexts for bdevs
|
||||
vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
|
||||
vector<uint64_t> block_total; ///< sum of block_all
|
||||
vector<Allocator*> alloc; ///< allocators for bdevs
|
||||
|
||||
void _init_logger();
|
||||
void _shutdown_logger();
|
||||
void _update_logger_stats();
|
||||
|
||||
void _init_alloc();
|
||||
void _stop_alloc();
|
||||
|
||||
@ -237,6 +255,7 @@ private:
|
||||
int _write_super();
|
||||
int _replay(); ///< replay journal
|
||||
|
||||
FileWriter *_create_writer(FileRef f);
|
||||
void _close_writer(FileWriter *h);
|
||||
|
||||
// always put the super in the second 4k block. FIXME should this be
|
||||
|
@ -1145,7 +1145,8 @@ int BlueStore::_open_db(bool create)
|
||||
} else if (s == "0") {
|
||||
do_bluefs = false;
|
||||
} else {
|
||||
derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" << dendl;
|
||||
derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
|
||||
<< dendl;
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
@ -1162,36 +1163,42 @@ int BlueStore::_open_db(bool create)
|
||||
|
||||
char bfn[PATH_MAX];
|
||||
struct stat st;
|
||||
int id = 0;
|
||||
|
||||
snprintf(bfn, sizeof(bfn), "%s/block.db", path.c_str());
|
||||
if (::stat(bfn, &st) == 0) {
|
||||
r = bluefs->add_block_device(id, bfn);
|
||||
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
|
||||
if (r < 0) {
|
||||
derr << __func__ << " add block device(" << bfn << ") returned: "
|
||||
<< cpp_strerror(r) << dendl;
|
||||
goto free_bluefs;
|
||||
}
|
||||
r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
|
||||
r = _check_or_set_bdev_label(
|
||||
bfn,
|
||||
bluefs->get_block_device_size(BlueFS::BDEV_DB),
|
||||
"bluefs db", create);
|
||||
if (r < 0) {
|
||||
derr << __func__ << " check block device(" << bfn << ") label returned: "
|
||||
derr << __func__
|
||||
<< " check block device(" << bfn << ") label returned: "
|
||||
<< cpp_strerror(r) << dendl;
|
||||
goto free_bluefs;
|
||||
}
|
||||
if (create) {
|
||||
bluefs->add_block_extent(
|
||||
id, BLUEFS_START,
|
||||
bluefs->get_block_device_size(id) - BLUEFS_START);
|
||||
BlueFS::BDEV_DB,
|
||||
BLUEFS_START,
|
||||
bluefs->get_block_device_size(BlueFS::BDEV_DB) - BLUEFS_START);
|
||||
}
|
||||
++id;
|
||||
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
|
||||
} else {
|
||||
bluefs_shared_bdev = BlueFS::BDEV_DB;
|
||||
}
|
||||
|
||||
// shared device
|
||||
snprintf(bfn, sizeof(bfn), "%s/block", path.c_str());
|
||||
r = bluefs->add_block_device(id, bfn);
|
||||
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
|
||||
if (r < 0) {
|
||||
derr << __func__ << " add block device(" << bfn << ") returned: "
|
||||
<< cpp_strerror(r) << dendl;
|
||||
<< cpp_strerror(r) << dendl;
|
||||
goto free_bluefs;
|
||||
}
|
||||
if (create) {
|
||||
@ -1204,21 +1211,23 @@ int BlueStore::_open_db(bool create)
|
||||
// align to bluefs's alloc_size
|
||||
initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
|
||||
initial += g_conf->bluefs_alloc_size - BLUEFS_START;
|
||||
bluefs->add_block_extent(id, BLUEFS_START, initial);
|
||||
bluefs->add_block_extent(bluefs_shared_bdev, BLUEFS_START, initial);
|
||||
bluefs_extents.insert(BLUEFS_START, initial);
|
||||
}
|
||||
bluefs_shared_bdev = id;
|
||||
++id;
|
||||
if (id == 2) {
|
||||
|
||||
// use a short, relative path, if it's bluefs.
|
||||
strcpy(fn, "db");
|
||||
|
||||
if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
|
||||
// we have both block.db and block; tell rocksdb!
|
||||
// note: the second (last) size value doesn't really matter
|
||||
char db_paths[PATH_MAX*3];
|
||||
snprintf(
|
||||
db_paths, sizeof(db_paths), "%s/db,%lld %s/db.slow,%lld",
|
||||
path.c_str(),
|
||||
(unsigned long long)bluefs->get_block_device_size(0) * 95 / 100,
|
||||
path.c_str(),
|
||||
(unsigned long long)bluefs->get_block_device_size(1) * 95 / 100);
|
||||
db_paths, sizeof(db_paths), "db,%lld db.slow,%lld",
|
||||
(unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_DB) *
|
||||
95 / 100,
|
||||
(unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_SLOW) *
|
||||
95 / 100);
|
||||
g_conf->set_val("rocksdb_db_paths", db_paths, false, false);
|
||||
dout(10) << __func__ << " set rocksdb_db_paths to "
|
||||
<< g_conf->rocksdb_db_paths << dendl;
|
||||
@ -1226,23 +1235,26 @@ int BlueStore::_open_db(bool create)
|
||||
|
||||
snprintf(bfn, sizeof(bfn), "%s/block.wal", path.c_str());
|
||||
if (::stat(bfn, &st) == 0) {
|
||||
r = bluefs->add_block_device(id, bfn);
|
||||
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
|
||||
if (r < 0) {
|
||||
derr << __func__ << " add block device(" << bfn << ") returned: "
|
||||
<< cpp_strerror(r) << dendl;
|
||||
goto free_bluefs;
|
||||
}
|
||||
r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
|
||||
r = _check_or_set_bdev_label(
|
||||
bfn,
|
||||
bluefs->get_block_device_size(BlueFS::BDEV_WAL),
|
||||
"bluefs wal", create);
|
||||
if (r < 0) {
|
||||
derr << __func__ << " check block device(" << bfn << ") label returned: "
|
||||
derr << __func__ << " check block device(" << bfn << ") label returned: "
|
||||
<< cpp_strerror(r) << dendl;
|
||||
goto free_bluefs;
|
||||
}
|
||||
if (create) {
|
||||
bluefs->add_block_extent(
|
||||
id, BDEV_LABEL_BLOCK_SIZE,
|
||||
bluefs->get_block_device_size(id) - BDEV_LABEL_BLOCK_SIZE);
|
||||
BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
|
||||
bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
|
||||
BDEV_LABEL_BLOCK_SIZE);
|
||||
}
|
||||
g_conf->set_val("rocksdb_separate_wal_dir", "true");
|
||||
} else {
|
||||
@ -1320,7 +1332,8 @@ int BlueStore::_open_db(bool create)
|
||||
delete bluefs;
|
||||
bluefs = NULL;
|
||||
}
|
||||
// delete env manually here since we can't depend on db to do this under this case
|
||||
// delete env manually here since we can't depend on db to do this
|
||||
// under this case
|
||||
delete env;
|
||||
env = NULL;
|
||||
return -EIO;
|
||||
|
@ -38,8 +38,8 @@ TEST(BlueFS, mkfs) {
|
||||
string fn = get_temp_bdev(size);
|
||||
uuid_d fsid;
|
||||
BlueFS fs;
|
||||
fs.add_block_device(0, fn);
|
||||
fs.add_block_extent(0, 1048576, size - 1048576);
|
||||
fs.add_block_device(BlueFS::BDEV_DB, fn);
|
||||
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
|
||||
fs.mkfs(fsid);
|
||||
rm_temp_bdev(fn);
|
||||
}
|
||||
@ -48,13 +48,13 @@ TEST(BlueFS, mkfs_mount) {
|
||||
uint64_t size = 1048476 * 128;
|
||||
string fn = get_temp_bdev(size);
|
||||
BlueFS fs;
|
||||
ASSERT_EQ(0, fs.add_block_device(0, fn));
|
||||
fs.add_block_extent(0, 1048576, size - 1048576);
|
||||
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
|
||||
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
|
||||
uuid_d fsid;
|
||||
ASSERT_EQ(0, fs.mkfs(fsid));
|
||||
ASSERT_EQ(0, fs.mount());
|
||||
ASSERT_EQ(fs.get_total(0), size - 1048576);
|
||||
ASSERT_LT(fs.get_free(0), size - 1048576);
|
||||
ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576);
|
||||
ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576);
|
||||
fs.umount();
|
||||
rm_temp_bdev(fn);
|
||||
}
|
||||
@ -63,8 +63,8 @@ TEST(BlueFS, write_read) {
|
||||
uint64_t size = 1048476 * 128;
|
||||
string fn = get_temp_bdev(size);
|
||||
BlueFS fs;
|
||||
ASSERT_EQ(0, fs.add_block_device(0, fn));
|
||||
fs.add_block_extent(0, 1048576, size - 1048576);
|
||||
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
|
||||
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
|
||||
uuid_d fsid;
|
||||
ASSERT_EQ(0, fs.mkfs(fsid));
|
||||
ASSERT_EQ(0, fs.mount());
|
||||
@ -99,8 +99,8 @@ TEST(BlueFS, small_appends) {
|
||||
uint64_t size = 1048476 * 128;
|
||||
string fn = get_temp_bdev(size);
|
||||
BlueFS fs;
|
||||
ASSERT_EQ(0, fs.add_block_device(0, fn));
|
||||
fs.add_block_extent(0, 1048576, size - 1048576);
|
||||
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
|
||||
fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
|
||||
uuid_d fsid;
|
||||
ASSERT_EQ(0, fs.mkfs(fsid));
|
||||
ASSERT_EQ(0, fs.mount());
|
||||
|
Loading…
Reference in New Issue
Block a user