Merge pull request #29425 from aclamk/wip-bluestore-monitor-allocations

[bluestore][tools] Inspect allocations in bluestore

Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Reviewed-by: Igor Fedotov <ifedotov@suse.com>
Reviewed-by: Neha Ojha <nojha@redhat.com>
This commit is contained in:
Neha Ojha 2019-08-07 11:37:34 -07:00 committed by GitHub
commit c9d2833b25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 697 additions and 70 deletions

View File

@ -22,6 +22,7 @@ Synopsis
| **ceph-bluestore-tool** bluefs-bdev-new-wal --path *osd path* --dev-target *new-device*
| **ceph-bluestore-tool** bluefs-bdev-new-db --path *osd path* --dev-target *new-device*
| **ceph-bluestore-tool** bluefs-bdev-migrate --path *osd path* --dev-target *new-device* --devs-source *device1* [--devs-source *device2*]
| **ceph-bluestore-tool** free-dump|free-score --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
Description
@ -81,6 +82,15 @@ Commands
Show device label(s).
:command:`free-dump` --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
Dump all free regions in allocator.
:command:`free-score` --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
Give a number in the [0-1] range that represents the quality of fragmentation in the allocator.
0 represents the case when all free space is in one chunk. 1 represents the worst possible fragmentation.
Options
=======
@ -117,6 +127,10 @@ Options
deep scrub/repair (read and validate object data, not just metadata)
.. option:: --allocator *name*
Useful for *free-dump* and *free-score* actions. Selects allocator(s).
Device labels
=============

View File

@ -335,6 +335,59 @@ needs to be stopped and BlueFS informed of the device size change with::
ceph-bluestore-tool bluefs-bdev-expand --path /var/lib/ceph/osd/ceph-$ID
BLUEFS_AVAILABLE_SPACE
______________________
To check how much space is free for BlueFS do::
ceph daemon osd.123 bluestore bluefs available
This will output up to 3 values: `BDEV_DB free`, `BDEV_SLOW free` and
`available_from_bluestore`. `BDEV_DB` and `BDEV_SLOW` report the amount of space that
has been acquired by BlueFS and is considered free. The value `available_from_bluestore`
denotes the ability of BlueStore to relinquish more space to BlueFS.
It is normal for this value to differ from the amount of BlueStore free space, as
the BlueFS allocation unit is typically larger than the BlueStore allocation unit.
This means that only part of the BlueStore free space will be acceptable for BlueFS.
BLUEFS_LOW_SPACE
_________________
If BlueFS is running low on available free space and there is little
`available_from_bluestore` one can consider reducing BlueFS allocation unit size.
To simulate available space when allocation unit is different do::
ceph daemon osd.123 bluestore bluefs available <alloc-unit-size>
BLUESTORE_FRAGMENTATION
_______________________
As BlueStore works, free space on the underlying storage will become fragmented.
This is normal and unavoidable, but excessive fragmentation will cause slowdown.
To inspect BlueStore fragmentation one can do::
ceph daemon osd.123 bluestore allocator score block
Score is given in [0-1] range.
[0.0 .. 0.4] tiny fragmentation
[0.4 .. 0.7] small, acceptable fragmentation
[0.7 .. 0.9] considerable, but safe fragmentation
[0.9 .. 1.0] severe fragmentation, may impact BlueFS ability to get space from BlueStore
If detailed report of free fragments is required do::
ceph daemon osd.123 bluestore allocator dump block
In case when handling OSD process that is not running fragmentation can be
inspected with `ceph-bluestore-tool`.
Get fragmentation score::
ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-123 --allocator block free-score
And dump detailed free chunks::
ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-123 --allocator block free-dump
BLUESTORE_LEGACY_STATFS
_______________________

View File

@ -317,6 +317,8 @@ bool AdminSocket::do_accept()
c = "foo";
break;
}
//wrap command with new protocol
c = "{\"prefix\": \"" + c + "\"}";
break;
}
} else {
@ -334,8 +336,28 @@ bool AdminSocket::do_accept()
}
}
bool rval = false;
bool rval;
bufferlist out;
rval = execute_command(c, out);
if (rval) {
uint32_t len = htonl(out.length());
int ret = safe_write(connection_fd, &len, sizeof(len));
if (ret < 0) {
lderr(m_cct) << "AdminSocket: error writing response length "
<< cpp_strerror(ret) << dendl;
rval = false;
} else {
if (out.write_fd(connection_fd) >= 0)
rval = true;
}
}
retry_sys_call(::close, connection_fd);
return rval;
}
int AdminSocket::execute_command(const std::string& cmd, ceph::bufferlist& out)
{
cmdmap_t cmdmap;
string format;
vector<string> cmdvec;
@ -343,14 +365,13 @@ bool AdminSocket::do_accept()
cmdvec.push_back(cmd);
if (!cmdmap_from_json(cmdvec, &cmdmap, errss)) {
ldout(m_cct, 0) << "AdminSocket: " << errss.str() << dendl;
retry_sys_call(::close, connection_fd);
return false;
}
string match;
try {
cmd_getval(m_cct, cmdmap, "format", format);
cmd_getval(m_cct, cmdmap, "prefix", c);
cmd_getval(m_cct, cmdmap, "prefix", match);
} catch (const bad_cmd_get& e) {
retry_sys_call(::close, connection_fd);
return false;
}
if (format != "json" && format != "json-pretty" &&
@ -359,7 +380,6 @@ bool AdminSocket::do_accept()
std::unique_lock l(lock);
decltype(hooks)::iterator p;
string match = c;
while (match.size()) {
p = hooks.find(match);
if (p != hooks.cend())
@ -375,53 +395,41 @@ bool AdminSocket::do_accept()
}
}
bufferlist out;
if (p == hooks.cend()) {
lderr(m_cct) << "AdminSocket: request '" << c << "' not defined" << dendl;
} else {
string args;
if (match != c) {
args = c.substr(match.length() + 1);
}
// Drop lock to avoid cycles in cases where the hook takes
// the same lock that was held during calls to register/unregister,
// and set in_hook to allow unregister to wait for us before
// removing this hook.
in_hook = true;
auto match_hook = p->second.hook;
l.unlock();
bool success = (validate(match, cmdmap, out) &&
match_hook->call(match, cmdmap, format, out));
l.lock();
in_hook = false;
in_hook_cond.notify_all();
if (!success) {
ldout(m_cct, 0) << "AdminSocket: request '" << match << "' args '" << args
<< "' to " << match_hook << " failed" << dendl;
out.append("failed");
} else {
ldout(m_cct, 5) << "AdminSocket: request '" << match << "' '" << args
<< "' to " << match_hook
<< " returned " << out.length() << " bytes" << dendl;
}
uint32_t len = htonl(out.length());
int ret = safe_write(connection_fd, &len, sizeof(len));
if (ret < 0) {
lderr(m_cct) << "AdminSocket: error writing response length "
<< cpp_strerror(ret) << dendl;
} else {
if (out.write_fd(connection_fd) >= 0)
rval = true;
}
lderr(m_cct) << "AdminSocket: request '" << cmd << "' not defined" << dendl;
return false;
}
string args;
if (match != cmd) {
args = cmd.substr(match.length() + 1);
}
l.unlock();
retry_sys_call(::close, connection_fd);
return rval;
// Drop lock to avoid cycles in cases where the hook takes
// the same lock that was held during calls to register/unregister,
// and set in_hook to allow unregister to wait for us before
// removing this hook.
in_hook = true;
auto match_hook = p->second.hook;
l.unlock();
bool success = (validate(match, cmdmap, out) &&
match_hook->call(match, cmdmap, format, out));
l.lock();
in_hook = false;
in_hook_cond.notify_all();
if (!success) {
ldout(m_cct, 0) << "AdminSocket: request '" << match << "' args '" << args
<< "' to " << match_hook << " failed" << dendl;
out.append("failed");
} else {
ldout(m_cct, 5) << "AdminSocket: request '" << match << "' '" << args
<< "' to " << match_hook
<< " returned " << out.length() << " bytes" << dendl;
}
return true;
}
bool AdminSocket::validate(const std::string& command,
const cmdmap_t& cmdmap,
bufferlist& out) const

View File

@ -94,6 +94,7 @@ public:
void chown(uid_t uid, gid_t gid);
void chmod(mode_t mode);
int execute_command(const std::string& cmd, ceph::bufferlist& out);
private:

View File

@ -5,20 +5,112 @@
#include "StupidAllocator.h"
#include "BitmapAllocator.h"
#include "common/debug.h"
#include "common/admin_socket.h"
#define dout_subsys ceph_subsys_bluestore
Allocator *Allocator::create(CephContext* cct, string type,
int64_t size, int64_t block_size)
{
if (type == "stupid") {
return new StupidAllocator(cct);
} else if (type == "bitmap") {
return new BitmapAllocator(cct, size, block_size);
class Allocator::SocketHook : public AdminSocketHook {
  Allocator *alloc;
  std::string name;
public:
  /*
   * Registers two admin socket commands for this allocator:
   *   "bluestore allocator dump <name>"  - list all free regions
   *   "bluestore allocator score <name>" - fragmentation score in [0-1]
   * If registering the dump command collides with an existing command the
   * hook disables itself (member alloc becomes nullptr) so the destructor
   * does not attempt to unregister commands that were never registered.
   */
  explicit SocketHook(Allocator *alloc, const std::string& _name) : alloc(alloc), name(_name)
  {
    AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
    if (name.empty()) {
      // no caller-supplied name; derive a unique one from this hook's address
      name = to_string((uintptr_t)this);
    }
    if (admin_socket) {
      int r = admin_socket->register_command(("bluestore allocator dump " + name).c_str(),
                                             ("bluestore allocator dump " + name).c_str(),
                                             this,
                                             "dump allocator free regions");
      if (r != 0) {
        // some collision, disable. NB: must clear the *member*; the bare
        // name 'alloc' here is the constructor parameter shadowing it, and
        // clearing only the parameter would leave ~SocketHook() asserting
        // on unregister of commands that were never registered.
        this->alloc = nullptr;
      }
      if (this->alloc) {
        r = admin_socket->register_command(("bluestore allocator score " + name).c_str(),
                                           ("bluestore allocator score " + name).c_str(),
                                           this,
                                           "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
        ceph_assert(r == 0);
      }
    }
  }
  ~SocketHook()
  {
    AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
    if (admin_socket && alloc) {
      int r = admin_socket->unregister_command(("bluestore allocator dump " + name).c_str());
      ceph_assert(r == 0);
      r = admin_socket->unregister_command(("bluestore allocator score " + name).c_str());
      ceph_assert(r == 0);
    }
  }
  bool call(std::string_view command, const cmdmap_t& cmdmap,
            std::string_view format, bufferlist& out) override {
    stringstream ss;
    bool r = true;
    if (command == "bluestore allocator dump " + name) {
      Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
      f->open_array_section("free_regions");
      auto iterated_allocation = [&](size_t off, size_t len) {
        ceph_assert(len > 0);
        f->open_object_section("free");
        char off_hex[30];
        char len_hex[30];
        // %zx is the portable specifier for size_t (%lx is wrong on LLP64)
        snprintf(off_hex, sizeof(off_hex) - 1, "0x%zx", off);
        snprintf(len_hex, sizeof(len_hex) - 1, "0x%zx", len);
        f->dump_string("offset", off_hex);
        f->dump_string("length", len_hex);
        f->close_section();
      };
      alloc->dump(iterated_allocation);
      f->close_section();
      f->flush(ss);
      delete f;  // was leaked here; the score branch below already freed its Formatter
    } else if (command == "bluestore allocator score " + name) {
      Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
      f->open_object_section("fragmentation_score");
      f->dump_float("fragmentation_rating", alloc->get_fragmentation_score());
      f->close_section();
      f->flush(ss);
      delete f;
    } else {
      ss << "Invalid command" << std::endl;
      r = false;
    }
    out.append(ss);
    return r;
  }
};
// Construct an allocator and expose it over the admin socket under @name.
// An empty name makes SocketHook derive a unique, address-based one.
Allocator::Allocator(const std::string& name)
{
  asok_hook = new SocketHook(this, name);
}
// Destroying the hook unregisters the admin socket commands registered
// by the constructor.
Allocator::~Allocator()
{
  delete asok_hook;
}
/*
 * Factory for the configured allocator implementation.
 * @type       "stupid" or "bitmap"; anything else is logged and yields nullptr
 * @size       device capacity (used by the bitmap allocator)
 * @block_size allocation unit (used by the bitmap allocator)
 * @name       admin-socket name under which the instance is exposed
 */
Allocator *Allocator::create(CephContext* cct, string type,
                             int64_t size, int64_t block_size, const std::string& name)
{
  if (type == "stupid") {
    return new StupidAllocator(cct, name);
  }
  if (type == "bitmap") {
    return new BitmapAllocator(cct, size, block_size, name);
  }
  lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
    << type << dendl;
  return nullptr;
}
void Allocator::release(const PExtentVector& release_vec)
@ -29,3 +121,55 @@ void Allocator::release(const PExtentVector& release_vec)
}
release(release_set);
}
/**
* Gives fragmentation a numeric value.
*
* Following algorithm applies value to each existing free unallocated block.
* Value of single block is a multiply of size and per-byte-value.
* Per-byte-value is greater for larger blocks.
* Assume block size X has value per-byte p; then block size 2*X will have per-byte value 1.1*p.
*
* This could be expressed in logarithms, but for speed this is interpolated inside ranges.
* [1] [2..3] [4..7] [8..15] ...
* ^ ^ ^ ^
* 1.1 1.1^2 1.1^3 1.1^4 ...
*
* Final score is obtained by proportion between score that would have been obtained
* in condition of absolute fragmentation and score in no fragmentation at all.
*/
/*
 * Compute the fragmentation score in [0-1]: 0 = all free space in one
 * chunk, 1 = worst possible fragmentation (every free byte isolated).
 * See the block comment above for the scoring model.
 */
double Allocator::get_fragmentation_score()
{
  // this value represents how much worth is 2X bytes in one chunk than in X + X bytes
  static const double double_size_worth = 1.1;
  std::vector<double> scales{1};
  double score_sum = 0;
  size_t sum = 0;

  // score of a single free chunk of @v bytes; @v must be > 0
  auto get_score = [&](size_t v) -> double {
    size_t sc = sizeof(v) * 8 - clz(v) - 1; //assign to grade depending on log2(len)
    while (scales.size() <= sc + 1) {
      //unlikely expand scales vector
      scales.push_back(scales[scales.size() - 1] * double_size_worth);
    }
    size_t sc_shifted = size_t(1) << sc;
    double x = double(v - sc_shifted) / sc_shifted; //x is <0,1) in its scale grade
    // linear extrapolation in its scale grade
    double score = (sc_shifted) * scales[sc] * (1-x) +
                   (sc_shifted * 2) * scales[sc+1] * x;
    return score;
  };

  auto iterated_allocation = [&](size_t off, size_t len) {
    ceph_assert(len > 0);
    score_sum += get_score(len);
    sum += len;
  };
  dump(iterated_allocation);

  if (sum == 0) {
    // no free space at all: get_score(0) would hit clz(0), which is
    // undefined, and the ratio below degenerates; report no fragmentation
    return 0.0;
  }
  double ideal = get_score(sum);
  double terrible = sum * get_score(1);
  if (ideal == terrible) {
    // single minimal free chunk: best and worst case coincide, avoid 0/0
    return 0.0;
  }
  return (ideal - score_sum) / (ideal - terrible);
}

View File

@ -15,10 +15,12 @@
#include <ostream>
#include "include/ceph_assert.h"
#include "os/bluestore/bluestore_types.h"
#include <functional>
class Allocator {
public:
virtual ~Allocator() {}
explicit Allocator(const std::string& name);
virtual ~Allocator();
/*
* Allocate required number of blocks in n number of extents.
@ -44,6 +46,7 @@ public:
void release(const PExtentVector& release_set);
virtual void dump() = 0;
virtual void dump(std::function<void(uint64_t offset, uint64_t length)> notify) = 0;
virtual void init_add_free(uint64_t offset, uint64_t length) = 0;
virtual void init_rm_free(uint64_t offset, uint64_t length) = 0;
@ -53,10 +56,14 @@ public:
{
return 0.0;
}
virtual double get_fragmentation_score();
virtual void shutdown() = 0;
static Allocator *create(CephContext* cct, string type, int64_t size,
int64_t block_size);
int64_t block_size, const std::string& name = "");
private:
class SocketHook;
SocketHook* asok_hook = nullptr;
};
#endif

View File

@ -10,7 +10,9 @@
BitmapAllocator::BitmapAllocator(CephContext* _cct,
int64_t capacity,
int64_t alloc_unit) :
int64_t alloc_unit,
const std::string& name) :
Allocator(name),
cct(_cct)
{
ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/"
@ -100,3 +102,13 @@ void BitmapAllocator::dump()
++it;
}
}
/*
 * Report every free region to @notify(offset, length) in bytes.
 * The underlying L1 bitmap reports positions in allocation units, so
 * each (off, len) pair is scaled by the minimum allocation size first.
 */
void BitmapAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
{
  size_t unit = get_min_alloc_size();
  auto scaled_notify = [unit, &notify](size_t off, size_t len) {
    notify(off * unit, len * unit);
  };
  std::lock_guard lck(lock);
  l1.dump(scaled_notify);
}

View File

@ -17,7 +17,7 @@ class BitmapAllocator : public Allocator,
CephContext* cct;
public:
BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit);
BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit, const std::string& name);
~BitmapAllocator() override
{
}
@ -36,6 +36,7 @@ public:
}
void dump() override;
void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
double get_fragmentation(uint64_t) override
{
return _get_fragmentation();

View File

@ -10,6 +10,7 @@
#include "BlockDevice.h"
#include "Allocator.h"
#include "include/ceph_assert.h"
#include "common/admin_socket.h"
#define dout_context cct
#define dout_subsys ceph_subsys_bluefs
@ -42,6 +43,78 @@ static void slow_discard_cb(void *priv, void* priv2) {
bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
}
class BlueFS::SocketHook : public AdminSocketHook {
  BlueFS* bluefs;
public:
  /*
   * Factory: registers the "bluestore bluefs available" admin socket
   * command (optional alloc_size parameter simulates a different BlueFS
   * allocation unit). Returns nullptr and logs when there is no admin
   * socket or registration fails.
   */
  static BlueFS::SocketHook* create(BlueFS* bluefs)
  {
    BlueFS::SocketHook* hook = nullptr;
    AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
    if (admin_socket) {
      hook = new BlueFS::SocketHook(bluefs);
      int r = admin_socket->register_command("bluestore bluefs available",
                                             "bluestore bluefs available "
                                             "name=alloc_size,type=CephInt,req=false",
                                             hook,
                                             "Report available space for bluefs. "
                                             "If alloc_size set, make simulation.");
      if (r != 0) {
        ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
        delete hook;
        hook = nullptr;
      }
    }
    return hook;
  }
  ~SocketHook() {
    // NOTE(review): assumes the admin socket still exists at teardown —
    // create() only builds a hook when one was present; confirm shutdown order
    AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
    int r = admin_socket->unregister_command("bluestore bluefs available");
    ceph_assert(r == 0);
  }
private:
  SocketHook(BlueFS* bluefs) :
    bluefs(bluefs) {}
  bool call(std::string_view command, const cmdmap_t& cmdmap,
            std::string_view format, bufferlist& out) override {
    stringstream ss;
    bool r = true;
    if (command == "bluestore bluefs available") {
      int64_t alloc_size = 0;
      cmd_getval(bluefs->cct, cmdmap, "alloc_size", alloc_size);
      // alloc_size must be a power of two; 0 means "use the configured default"
      if ((alloc_size & (alloc_size - 1)) != 0) {
        // fixed: message previously had an unbalanced opening quote
        ss << "Invalid allocation size:'" << alloc_size << "'" << std::endl;
      }
      if (alloc_size == 0)
        alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
      Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
      f->open_object_section("bluefs_available_space");
      // per-device free space already owned by BlueFS
      for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
        if (bluefs->bdev[dev]) {
          f->open_object_section("dev");
          f->dump_string("device", bluefs->get_device_name(dev));
          ceph_assert(bluefs->alloc[dev]);
          f->dump_int("free", bluefs->alloc[dev]->get_free());
          f->close_section();
        }
      }
      // plus whatever BlueStore could still hand over at this alloc_size
      size_t extra_space = 0;
      if (bluefs->slow_dev_expander) {
        extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
      }
      f->dump_int("available_from_bluestore", extra_space);
      f->close_section();
      f->flush(ss);
      delete f;
    } else {
      ss << "Invalid command" << std::endl;
      r = false;
    }
    out.append(ss);
    return r;
  }
};
BlueFS::BlueFS(CephContext* cct)
: cct(cct),
bdev(MAX_BDEV),
@ -51,10 +124,12 @@ BlueFS::BlueFS(CephContext* cct)
discard_cb[BDEV_WAL] = wal_discard_cb;
discard_cb[BDEV_DB] = db_discard_cb;
discard_cb[BDEV_SLOW] = slow_discard_cb;
asok_hook = SocketHook::create(this);
}
BlueFS::~BlueFS()
{
delete asok_hook;
for (auto p : ioc) {
if (p)
p->aio_wait();
@ -441,9 +516,15 @@ void BlueFS::_init_alloc()
continue;
}
ceph_assert(bdev[id]->get_size());
std::string name = "bluefs-";
const char* devnames[] = {"wal","db","slow"};
if (id <= BDEV_SLOW)
name += devnames[id];
else
name += to_string(uintptr_t(this));
alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
bdev[id]->get_size(),
cct->_conf->bluefs_alloc_size);
cct->_conf->bluefs_alloc_size, name);
interval_set<uint64_t>& p = block_all[id];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
alloc[id]->init_add_free(q.get_start(), q.get_len());
@ -2466,6 +2547,13 @@ void BlueFS::flush_bdev()
}
}
// Human-readable name for a bdev slot; out-of-range ids yield "BDEV_INV".
const char* BlueFS::get_device_name(unsigned id)
{
  static const char* names[] = {
    "BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"
  };
  return id < MAX_BDEV ? names[id] : "BDEV_INV";
}
int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
{
int r = -ENOSPC;

View File

@ -63,6 +63,13 @@ public:
uint64_t min_size,
uint64_t size,
PExtentVector& extents) = 0;
/** Reports amount of space that can be transferred to BlueFS.
* This gives either current state, when alloc_size is currently used
* BlueFS's size, or simulation when alloc_size is different.
* @params
* alloc_size - allocation unit size to check
*/
virtual size_t available_freespace(uint64_t alloc_size) = 0;
};
class BlueFS {
@ -305,6 +312,9 @@ private:
BlueFSDeviceExpander* slow_dev_expander = nullptr;
class SocketHook;
SocketHook* asok_hook = nullptr;
void _init_logger();
void _shutdown_logger();
void _update_logger_stats();
@ -318,6 +328,7 @@ private:
void _drop_link(FileRef f);
int _get_slow_device_id() { return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; }
const char* get_device_name(unsigned id);
int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
int _allocate(uint8_t bdev, uint64_t len,
bluefs_fnode_t* node);

View File

@ -4926,7 +4926,7 @@ int BlueStore::_open_alloc()
alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
bdev->get_size(),
min_alloc_size);
min_alloc_size, "block");
if (!alloc) {
lderr(cct) << __func__ << " Allocator::unknown alloc type "
<< cct->_conf->bluestore_allocator
@ -5704,6 +5704,25 @@ int BlueStore::allocate_bluefs_freespace(
return 0;
}
// Total free space BlueStore could relinquish to BlueFS: for each free
// extent, only the alloc_size-aligned, alloc_size-granular portion counts.
// NOTE(review): assumes alloc_size is a non-zero power of two — confirm callers.
size_t BlueStore::available_freespace(uint64_t alloc_size) {
  size_t total = 0;
  auto count_aligned = [&](size_t off, size_t len) {
    size_t skew = off & (alloc_size - 1);
    if (skew != 0) {
      // bytes until the next aligned boundary
      size_t pad = alloc_size - skew;
      if (pad >= len)
        return; // extent too small to ever reach alignment
      len -= pad;
    }
    total += p2align(len, alloc_size);
  };
  alloc->dump(count_aligned);
  return total;
}
int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
{
float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
@ -6755,6 +6774,48 @@ int BlueStore::umount()
return 0;
}
// Open just enough of the store (path, fsid lock, bdev, db) to inspect it
// without a full mount; used by offline tooling such as the free-dump /
// free-score actions of ceph-bluestore-tool. Pair with cold_close().
// Returns 0 on success or a negative errno; on error everything opened
// so far is unwound via the goto ladder below (reverse acquisition order).
int BlueStore::cold_open()
{
  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  // exclusive fsid lock: prevents a running OSD from racing with us
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  r = _open_db_and_around(true);
  if (r < 0) {
    goto out_bdev;
  }
  return 0;
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
// Undo cold_open(): release db, bdev, fsid lock and path fd, in reverse
// order of acquisition. Always returns 0.
int BlueStore::cold_close()
{
  _close_db_and_around();
  _close_bdev();
  _close_fsid();
  _close_path();
  return 0;
}
static void apply(uint64_t off,
uint64_t len,
uint64_t granularity,

View File

@ -2293,6 +2293,8 @@ public:
int write_meta(const std::string& key, const std::string& value) override;
int read_meta(const std::string& key, std::string *value) override;
int cold_open();
int cold_close();
int fsck(bool deep) override {
return _fsck(deep, false);
@ -2894,7 +2896,7 @@ private:
PExtentVector& extents) override {
return allocate_bluefs_freespace(min_size, size, &extents);
};
size_t available_freespace(uint64_t alloc_size) override;
inline bool _use_rotational_settings();
};

View File

@ -10,8 +10,8 @@
#undef dout_prefix
#define dout_prefix *_dout << "stupidalloc 0x" << this << " "
StupidAllocator::StupidAllocator(CephContext* cct)
: cct(cct), num_free(0),
StupidAllocator::StupidAllocator(CephContext* cct, const std::string& name)
: Allocator(name), cct(cct), num_free(0),
free(10),
last_alloc(0)
{
@ -293,6 +293,16 @@ void StupidAllocator::dump()
}
}
void StupidAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
{
std::lock_guard l(lock);
for (unsigned bin = 0; bin < free.size(); ++bin) {
for (auto p = free[bin].begin(); p != free[bin].end(); ++p) {
notify(p.get_start(), p.get_len());
}
}
}
void StupidAllocator::init_add_free(uint64_t offset, uint64_t length)
{
std::lock_guard l(lock);

View File

@ -35,7 +35,7 @@ class StupidAllocator : public Allocator {
uint64_t alloc_unit);
public:
StupidAllocator(CephContext* cct);
StupidAllocator(CephContext* cct, const std::string& name = "");
~StupidAllocator() override;
int64_t allocate(
@ -53,6 +53,7 @@ public:
double get_fragmentation(uint64_t alloc_unit) override;
void dump() override;
void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
void init_add_free(uint64_t offset, uint64_t length) override;
void init_rm_free(uint64_t offset, uint64_t length) override;

View File

@ -18,6 +18,7 @@
#include "os/bluestore/BlueFS.h"
#include "os/bluestore/BlueStore.h"
#include "common/admin_socket.h"
namespace po = boost::program_options;
@ -224,6 +225,7 @@ int main(int argc, char **argv)
string action;
string log_file;
string key, value;
vector<string> allocs_name;
int log_level = 30;
bool fsck_deep = false;
po::options_description po_options("Options");
@ -239,10 +241,26 @@ int main(int argc, char **argv)
("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
("key,k", po::value<string>(&key), "label metadata key name")
("value,v", po::value<string>(&value), "label metadata value")
("allocator", po::value<vector<string>>(&allocs_name), "allocator to inspect: 'block'/'bluefs-wal'/'bluefs-db'/'bluefs-slow'")
;
po::options_description po_positional("Positional options");
po_positional.add_options()
("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, bluefs-bdev-new-db, bluefs-bdev-new-wal, bluefs-bdev-migrate, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
("command", po::value<string>(&action),
"fsck, "
"repair, "
"bluefs-export, "
"bluefs-bdev-sizes, "
"bluefs-bdev-expand, "
"bluefs-bdev-new-db, "
"bluefs-bdev-new-wal, "
"bluefs-bdev-migrate, "
"show-label, "
"set-label-key, "
"rm-label-key, "
"prime-osd-dir, "
"bluefs-log-dump, "
"free-dump, "
"free-score")
;
po::options_description po_all("All options");
po_all.add(po_options).add(po_positional);
@ -357,7 +375,24 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
}
if (action == "free-score" || action == "free-dump") {
if (path.empty()) {
cerr << "must specify bluestore path" << std::endl;
exit(EXIT_FAILURE);
}
for (auto name : allocs_name) {
if (!name.empty() &&
name != "block" &&
name != "bluefs-db" &&
name != "bluefs-wal" &&
name != "bluefs-slow") {
cerr << "unknown allocator '" << name << "'" << std::endl;
exit(EXIT_FAILURE);
}
}
if (allocs_name.empty())
allocs_name = vector<string>{"block", "bluefs-db", "bluefs-wal", "bluefs-slow"};
}
vector<const char*> args;
if (log_file.size()) {
args.push_back("--log-file");
@ -792,6 +827,31 @@ int main(int argc, char **argv)
}
return r;
}
} else if (action == "free-dump" || action == "free-score") {
AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
ceph_assert(admin_socket);
std::string action_name = action == "free-dump" ? "dump" : "score";
validate_path(cct.get(), path, false);
BlueStore bluestore(cct.get(), path);
int r = bluestore.cold_open();
if (r < 0) {
cerr << "error from cold_open: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
for (auto alloc_name : allocs_name) {
ceph::bufferlist out;
bool b = admin_socket->execute_command(
"{\"prefix\": \"bluestore allocator " + action_name + " " + alloc_name + "\"}", out);
if (!b) {
cerr << "failure querying '" << alloc_name << "'" << std::endl;
exit(EXIT_FAILURE);
}
cout << alloc_name << ":" << std::endl;
cout << std::string(out.c_str(),out.length()) << std::endl;
}
bluestore.cold_close();
} else {
cerr << "unrecognized action " << action << std::endl;
return 1;

View File

@ -544,3 +544,88 @@ void AllocatorLevel01Loose::collect_stats(
bins_overall[cbits(free_seq_cnt) - 1]++;
}
}
/*
 * Count consecutive 0 bits in slot_val starting at bit start_pos.
 * In the L0 bitmap a 0 bit means "allocated" (see dump() below), so this
 * returns the length of the allocated run beginning at start_pos, or the
 * number of remaining bit positions if no 1 bit follows.
 * Precondition: start_pos < bits_per_slot (a full-width shift is UB).
 */
inline ssize_t AllocatorLevel01Loose::count_0s(slot_t slot_val, size_t start_pos)
{
 #ifdef __GNUC__
  // ffsll returns 1-based index of the lowest set bit, 0 if none
  size_t pos = __builtin_ffsll(slot_val >> start_pos);
  if (pos == 0)
    return sizeof(slot_t)*8 - start_pos;
  return pos - 1;
 #else
  // portable fallback: scan bit by bit
  size_t pos = start_pos;
  slot_t mask = slot_t(1) << pos;
  while (pos < bits_per_slot && (slot_val & mask) == 0) {
    mask <<= 1;
    pos++;
  }
  return pos - start_pos;
 #endif
}
/*
 * Count consecutive 1 bits (free space) in slot_val starting at start_pos,
 * by counting 0s in the complement. Same precondition as count_0s().
 */
inline ssize_t AllocatorLevel01Loose::count_1s(slot_t slot_val, size_t start_pos)
{
  return count_0s(~slot_val, start_pos);
}
void AllocatorLevel01Loose::dump(
std::function<void(uint64_t offset, uint64_t length)> notify)
{
size_t len = 0;
size_t off = 0;
for (size_t i = 0; i < l1.size(); i++)
{
for (size_t j = 0; j < L1_ENTRIES_PER_SLOT * L1_ENTRY_WIDTH; j += L1_ENTRY_WIDTH)
{
size_t w = (l1[i] >> j) & L1_ENTRY_MASK;
switch (w) {
case L1_ENTRY_FULL:
if (len > 0) {
notify(off, len);
len = 0;
}
break;
case L1_ENTRY_FREE:
if (len == 0)
off = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset * bits_per_slot;
len += bits_per_slotset;
break;
case L1_ENTRY_PARTIAL:
size_t pos = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset;
for (size_t t = 0; t < slots_per_slotset; t++) {
size_t p = 0;
slot_t allocation_pattern = l0[pos + t];
while (p < bits_per_slot) {
if (len == 0) {
//continue to skip allocated space, meaning bits set to 0
ssize_t alloc_count = count_0s(allocation_pattern, p);
p += alloc_count;
//now we are switched to expecting free space
if (p < bits_per_slot) {
//now @p are 1s
ssize_t free_count = count_1s(allocation_pattern, p);
assert(free_count > 0);
len = free_count;
off = (pos + t) * bits_per_slot + p;
p += free_count;
}
} else {
//continue free region
ssize_t free_count = count_1s(allocation_pattern, p);
if (free_count == 0) {
notify(off, len);
len = 0;
} else {
p += free_count;
len += free_count;
}
}
}
}
break;
}
}
}
if (len > 0)
notify(off, len);
}

View File

@ -46,6 +46,7 @@ typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t;
// fitting into cache line on x86_64
static const size_t slotset_width = 8; // 8 slots per set
static const size_t slots_per_slotset = 8;
static const size_t slotset_bytes = sizeof(slot_t) * slotset_width;
static const size_t bits_per_slot = sizeof(slot_t) * 8;
static const size_t bits_per_slotset = slotset_bytes * 8;
@ -141,6 +142,7 @@ class AllocatorLevel01Loose : public AllocatorLevel01
L1_ENTRY_NOT_USED = 0x02,
L1_ENTRY_FREE = 0x03,
CHILD_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, // 32
L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, //32
CHILD_PER_SLOT_L0 = bits_per_slot, // 64
};
uint64_t _children_per_slot() const override
@ -469,8 +471,13 @@ public:
}
void collect_stats(
std::map<size_t, size_t>& bins_overall) override;
static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
void dump(std::function<void(uint64_t offset, uint64_t length)> notify);
};
class AllocatorLevel01Compact : public AllocatorLevel01
{
uint64_t _children_per_slot() const override

View File

@ -294,6 +294,68 @@ TEST_P(AllocTest, test_alloc_fragmentation)
EXPECT_EQ(0u, uint64_t(alloc->get_fragmentation(alloc_unit) * 100));
}
// Stress dump() and get_fragmentation_score(): randomly allocate/free for
// several rounds, then verify (a) the score stays below 1 and (b) dump()'s
// free extents plus the allocated byte count always equal the capacity.
TEST_P(AllocTest, test_dump_fragmentation_score)
{
  uint64_t capacity = 1024 * 1024 * 1024;
  uint64_t one_alloc_max = 2 * 1024 * 1024;
  uint64_t alloc_unit = 4096;
  uint64_t want_size = alloc_unit;
  uint64_t rounds = 10;
  uint64_t actions_per_round = 1000;
  PExtentVector allocated, tmp;
  gen_type rng;

  init_alloc(capacity, alloc_unit);
  alloc->init_add_free(0, capacity);
  EXPECT_EQ(0.0, alloc->get_fragmentation(alloc_unit));

  uint64_t allocated_cnt = 0;
  for (size_t round = 0; round < rounds ; round++) {
    for (size_t j = 0; j < actions_per_round ; j++) {
      //free or allocate ?
      // bias: the fuller the device, the more likely we free
      if ( rng() % capacity >= allocated_cnt ) {
        //allocate
        // random size in [alloc_unit, one_alloc_max], alloc_unit-granular
        want_size = ( rng() % one_alloc_max ) / alloc_unit * alloc_unit + alloc_unit;
        tmp.clear();
        uint64_t r = alloc->allocate(want_size, alloc_unit, 0, 0, &tmp);
        for (auto& t: tmp) {
          if (t.length > 0)
            allocated.push_back(t);
        }
        allocated_cnt += r;
      } else {
        //free
        // release a random previously-allocated extent
        ceph_assert(allocated.size() > 0);
        size_t item = rng() % allocated.size();
        ceph_assert(allocated[item].length > 0);
        allocated_cnt -= allocated[item].length;
        interval_set<uint64_t> release_set;
        release_set.insert(allocated[item].offset, allocated[item].length);
        alloc->release(release_set);
        // O(1) removal: swap with the last element and shrink
        std::swap(allocated[item], allocated[allocated.size() - 1]);
        allocated.resize(allocated.size() - 1);
      }
    }
    // sum free space as reported by dump() and check the accounting closes
    size_t free_sum = 0;
    auto iterated_allocation = [&](size_t off, size_t len) {
      ceph_assert(len > 0);
      free_sum += len;
    };
    alloc->dump(iterated_allocation);
    EXPECT_GT(1, alloc->get_fragmentation_score());
    EXPECT_EQ(capacity, free_sum + allocated_cnt);
  }

  // release everything that is still allocated
  for (size_t i = 0; i < allocated.size(); i ++)
  {
    interval_set<uint64_t> release_set;
    release_set.insert(allocated[i].offset, allocated[i].length);
    alloc->release(release_set);
  }
}
TEST_P(AllocTest, test_alloc_bug_24598)
{
if (string(GetParam()) != "bitmap")