mirror of
https://github.com/ceph/ceph
synced 2024-12-28 22:43:29 +00:00
Merge pull request #34588 from ifed01/wip-ifed-lower-spinner-mas
os/bluestore: set bluestore_min_alloc_size to 4K for spinners Reviewed-by: Josh Durgin <jdurgin@redhat.com> Reviewed-by: Mark Nelson <mnelson@redhat.com> Reviewed-by: Neha Ojha <nojha@redhat.com>
This commit is contained in:
commit
a8733598ed
@ -12,8 +12,8 @@ overrides:
|
||||
debug bluefs: 20
|
||||
debug rocksdb: 10
|
||||
bluestore fsck on mount: true
|
||||
bluestore allocator: avl
|
||||
bluefs allocator: avl
|
||||
bluestore allocator: hybrid
|
||||
bluefs allocator: hybrid
|
||||
# lower the full ratios since we can fill up a 100gb osd so quickly
|
||||
mon osd full ratio: .9
|
||||
mon osd backfillfull_ratio: .85
|
@ -4019,7 +4019,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_description(""),
|
||||
|
||||
Option("bluefs_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
|
||||
.set_default("bitmap")
|
||||
.set_default("hybrid")
|
||||
.set_enum_allowed({"bitmap", "stupid", "avl", "hybrid"})
|
||||
.set_description(""),
|
||||
|
||||
@ -4183,7 +4183,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_long_description("A smaller allocation size generally means less data is read and then rewritten when a copy-on-write operation is triggered (e.g., when writing to something that was recently snapshotted). Similarly, less data is journaled before performing an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore journal). Larger values of min_alloc_size reduce the amount of metadata required to describe the on-disk layout and reduce overall fragmentation."),
|
||||
|
||||
Option("bluestore_min_alloc_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
|
||||
.set_default(64_K)
|
||||
.set_default(4_K)
|
||||
.set_flag(Option::FLAG_CREATE)
|
||||
.set_description("Default min_alloc_size value for rotational media")
|
||||
.add_see_also("bluestore_min_alloc_size"),
|
||||
@ -4205,7 +4205,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_description("Writes smaller than this size will be written to the journal and then asynchronously written to the device. This can be beneficial when using rotational media where seeks are expensive, and is helpful both with and without solid state journal/wal devices."),
|
||||
|
||||
Option("bluestore_prefer_deferred_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
|
||||
.set_default(65536)
|
||||
.set_default(64_K)
|
||||
.set_flag(Option::FLAG_RUNTIME)
|
||||
.set_description("Default bluestore_prefer_deferred_size for rotational media")
|
||||
.add_see_also("bluestore_prefer_deferred_size"),
|
||||
@ -4237,7 +4237,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_long_description("Chunks larger than this are broken into smaller chunks before being compressed"),
|
||||
|
||||
Option("bluestore_compression_min_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
|
||||
.set_default(128_K)
|
||||
.set_default(8_K)
|
||||
.set_flag(Option::FLAG_RUNTIME)
|
||||
.set_description("Default value of bluestore_compression_min_blob_size for rotational media")
|
||||
.add_see_also("bluestore_compression_min_blob_size"),
|
||||
@ -4255,7 +4255,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_long_description("Chunks larger than this are broken into smaller chunks before being compressed"),
|
||||
|
||||
Option("bluestore_compression_max_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
|
||||
.set_default(512_K)
|
||||
.set_default(64_K)
|
||||
.set_flag(Option::FLAG_RUNTIME)
|
||||
.set_description("Default value of bluestore_compression_max_blob_size for rotational media")
|
||||
.add_see_also("bluestore_compression_max_blob_size"),
|
||||
@ -4283,7 +4283,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_long_description("Bluestore blobs are collections of extents (ie on-disk data) originating from one or more objects. Blobs can be compressed, typically have checksum data, may be overwritten, may be shared (with an extent ref map), or split. This setting controls the maximum size a blob is allowed to be."),
|
||||
|
||||
Option("bluestore_max_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_DEV)
|
||||
.set_default(512_K)
|
||||
.set_default(64_K)
|
||||
.set_flag(Option::FLAG_RUNTIME)
|
||||
.set_description("")
|
||||
.add_see_also("bluestore_max_blob_size"),
|
||||
@ -4387,7 +4387,7 @@ std::vector<Option> get_global_options() {
|
||||
.set_description("Key value database to use for bluestore"),
|
||||
|
||||
Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
|
||||
.set_default("bitmap")
|
||||
.set_default("hybrid")
|
||||
.set_enum_allowed({"bitmap", "stupid", "avl", "hybrid"})
|
||||
.set_description("Allocator policy")
|
||||
.set_long_description("Allocator to use for bluestore. Stupid should only be used for testing."),
|
||||
|
@ -1318,6 +1318,8 @@ TEST_P(StoreTest, SimpleObjectTest) {
|
||||
TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) {
|
||||
if(string(GetParam()) != "bluestore")
|
||||
return;
|
||||
|
||||
SetVal(g_conf(), "bluestore_max_blob_size", "524288");
|
||||
SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
|
||||
g_conf().apply_changes(nullptr);
|
||||
StartDeferred(65536);
|
||||
@ -3768,6 +3770,7 @@ public:
|
||||
unsigned in_flight;
|
||||
map<ghobject_t, Object> contents;
|
||||
set<ghobject_t> available_objects;
|
||||
set<ghobject_t>::iterator next_available_object;
|
||||
set<ghobject_t> in_flight_objects;
|
||||
ObjectGenerator *object_gen;
|
||||
gen_type *rng;
|
||||
@ -3892,8 +3895,9 @@ public:
|
||||
unsigned max_write,
|
||||
unsigned alignment)
|
||||
: cid(cid), write_alignment(alignment), max_object_len(max_size),
|
||||
max_write_len(max_write), in_flight(0), object_gen(gen),
|
||||
rng(rng), store(store) {}
|
||||
max_write_len(max_write), in_flight(0),
|
||||
next_available_object(available_objects.end()),
|
||||
object_gen(gen), rng(rng), store(store) {}
|
||||
|
||||
int init() {
|
||||
ObjectStore::Transaction t;
|
||||
@ -3902,17 +3906,19 @@ public:
|
||||
return queue_transaction(store, ch, std::move(t));
|
||||
}
|
||||
void shutdown() {
|
||||
ghobject_t next;
|
||||
while (1) {
|
||||
vector<ghobject_t> objects;
|
||||
int r = store->collection_list(ch, ghobject_t(), ghobject_t::get_max(),
|
||||
10, &objects, 0);
|
||||
int r = store->collection_list(ch, next, ghobject_t::get_max(),
|
||||
10, &objects, &next);
|
||||
ceph_assert(r >= 0);
|
||||
if (objects.empty())
|
||||
break;
|
||||
if (objects.size() == 0)
|
||||
break;
|
||||
ObjectStore::Transaction t;
|
||||
std::map<std::string, ceph::buffer::list> attrset;
|
||||
for (vector<ghobject_t>::iterator p = objects.begin();
|
||||
p != objects.end(); ++p) {
|
||||
t.remove(cid, *p);
|
||||
p != objects.end(); ++p) {
|
||||
t.remove(cid, *p);
|
||||
}
|
||||
queue_transaction(store, ch, std::move(t));
|
||||
}
|
||||
@ -3936,6 +3942,20 @@ public:
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Returns the next object from available_objects in iteration order,
// wrapping back to the first element once the cursor reaches the end.
//
// Blocks on `cond` until there is in-flight capacity
// (in_flight < max_in_flight) and available_objects is non-empty.
// The returned object is NOT removed from available_objects; the caller
// decides whether to erase it.
//
// locker: lock handed to cond while waiting (presumably the lock that
//         guards this state -- confirm against callers).
ghobject_t get_next_object(std::unique_lock<ceph::mutex>& locker) {
  // Explicit form of the predicate wait: keep sleeping while we are either
  // at the in-flight limit or have nothing to hand out.
  while (in_flight >= max_in_flight || available_objects.empty()) {
    cond.wait(locker);
  }

  // The cursor sits at end() both initially and after a full pass.
  if (next_available_object == available_objects.end())
    next_available_object = available_objects.begin();

  // Copy the current element out, then step the cursor past it so that a
  // caller erasing the returned object does not touch the cursor's element.
  ghobject_t picked = *next_available_object++;
  return picked;
}
|
||||
|
||||
// Blocks on `cond` until the number of in-flight operations drops below
// max_in_flight. `locker` is the lock released while waiting.
void wait_for_ready(std::unique_lock<ceph::mutex>& locker) {
  while (in_flight >= max_in_flight) {
    cond.wait(locker);
  }
}
|
||||
@ -4392,6 +4412,35 @@ public:
|
||||
return status;
|
||||
}
|
||||
|
||||
int set_fixed_attrs(size_t entries, size_t key_size, size_t val_size) {
|
||||
std::unique_lock locker{ lock };
|
||||
EnterExit ee("setattrs");
|
||||
if (!can_unlink())
|
||||
return -ENOENT;
|
||||
wait_for_ready(locker);
|
||||
|
||||
ghobject_t obj = get_next_object(locker);
|
||||
available_objects.erase(obj);
|
||||
ObjectStore::Transaction t;
|
||||
|
||||
map<string, bufferlist> attrs;
|
||||
set<string> keys;
|
||||
|
||||
while (entries--) {
|
||||
bufferlist name, value;
|
||||
filled_byte_array(value, val_size);
|
||||
filled_byte_array(name, key_size);
|
||||
attrs[name.c_str()] = value;
|
||||
contents[obj].attrs[name.c_str()] = value;
|
||||
}
|
||||
t.setattrs(cid, obj, attrs);
|
||||
++in_flight;
|
||||
in_flight_objects.insert(obj);
|
||||
t.register_on_applied(new C_SyntheticOnReadable(this, obj));
|
||||
int status = store->queue_transaction(ch, std::move(t));
|
||||
return status;
|
||||
}
|
||||
|
||||
void getattrs() {
|
||||
EnterExit ee("getattrs");
|
||||
ghobject_t obj;
|
||||
@ -6680,8 +6729,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) {
|
||||
return;
|
||||
|
||||
size_t block_size = 4096;
|
||||
// this will enable contiguous allocations
|
||||
SetVal(g_conf(), "bluestore_allocator", "avl");
|
||||
StartDeferred(block_size);
|
||||
SetVal(g_conf(), "bluestore_max_blob_size", "131072");
|
||||
SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
|
||||
@ -7117,8 +7164,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredDifferentChunks) {
|
||||
|
||||
size_t alloc_size = 4096;
|
||||
size_t large_object_size = 1 * 1024 * 1024;
|
||||
// this will enable contiguous allocations
|
||||
SetVal(g_conf(), "bluestore_allocator", "avl");
|
||||
StartDeferred(alloc_size);
|
||||
SetVal(g_conf(), "bluestore_max_blob_size", "131072");
|
||||
SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
|
||||
@ -8709,22 +8754,23 @@ void doManySetAttr(ObjectStore* store,
|
||||
gen_type rng(time(NULL));
|
||||
coll_t cid(spg_t(pg_t(0, 447), shard_id_t::NO_SHARD));
|
||||
|
||||
SyntheticWorkloadState test_obj(store, &gen, &rng, cid, 40 * 1024, 4 * 1024, 0);
|
||||
SyntheticWorkloadState test_obj(store, &gen, &rng, cid, 0, 0, 0);
|
||||
test_obj.init();
|
||||
for (int i = 0; i < 1500; ++i) {
|
||||
size_t object_count = 256;
|
||||
for (size_t i = 0; i < object_count; ++i) {
|
||||
if (!(i % 10)) cerr << "seeding object " << i << std::endl;
|
||||
test_obj.touch();
|
||||
}
|
||||
for (int i = 0; i < 10000; ++i) {
|
||||
for (size_t i = 0; i < object_count; ++i) {
|
||||
if (!(i % 100)) {
|
||||
cerr << "Op " << i << std::endl;
|
||||
test_obj.print_internal_state();
|
||||
}
|
||||
boost::uniform_int<> true_false(0, 99);
|
||||
test_obj.setattrs();
|
||||
test_obj.set_fixed_attrs(1024, 64, 4096); // 1024 attributes, 64 bytes name and 4K value
|
||||
}
|
||||
test_obj.wait_for_done();
|
||||
|
||||
std::cout << "done" << std::endl;
|
||||
AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
|
||||
ceph_assert(admin_socket);
|
||||
|
||||
@ -8761,7 +8807,9 @@ TEST_P(StoreTestSpecificAUSize, SpilloverTest) {
|
||||
const PerfCounters* logger = bstore->get_bluefs_perf_counters();
|
||||
//experimentally it was discovered that this case results in 400+MB spillover
|
||||
//using lower 300MB threshold just to be safe enough
|
||||
ASSERT_GE(logger->get(l_bluefs_slow_used_bytes), 300 * 1024 * 1024);
|
||||
std::cout << "db_used:" << logger->get(l_bluefs_db_used_bytes) << std::endl;
|
||||
std::cout << "slow_used:" << logger->get(l_bluefs_slow_used_bytes) << std::endl;
|
||||
ASSERT_GE(logger->get(l_bluefs_slow_used_bytes), 16 * 1024 * 1024);
|
||||
|
||||
}
|
||||
);
|
||||
|
@ -82,6 +82,7 @@ TEST(BlueFS, mkfs_mount) {
|
||||
TEST(BlueFS, mkfs_mount_duplicate_gift) {
|
||||
uint64_t size = 1048576 * 128;
|
||||
TempBdev bdev{ size };
|
||||
bluefs_extent_t dup_ext;
|
||||
{
|
||||
BlueFS fs(g_ceph_context);
|
||||
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
|
||||
@ -98,6 +99,9 @@ TEST(BlueFS, mkfs_mount_duplicate_gift) {
|
||||
h->append("bar", 3);
|
||||
h->append("baz", 3);
|
||||
fs.fsync(h);
|
||||
ceph_assert(h->file->fnode.extents.size() > 0);
|
||||
dup_ext = h->file->fnode.extents[0];
|
||||
ceph_assert(dup_ext.bdev == BlueFS::BDEV_DB);
|
||||
fs.close_writer(h);
|
||||
}
|
||||
|
||||
@ -109,7 +113,10 @@ TEST(BlueFS, mkfs_mount_duplicate_gift) {
|
||||
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
|
||||
ASSERT_EQ(0, fs.mount());
|
||||
// free allocation presumably allocated for file1
|
||||
fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, 5 * 1048576, 1048576);
|
||||
std::cout << "duplicate extent: " << std::hex
|
||||
<< dup_ext.offset << "~" << dup_ext.length
|
||||
<< std::dec << std::endl;
|
||||
fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, dup_ext.offset, dup_ext.length);
|
||||
{
|
||||
// overwrite file1 with file2
|
||||
BlueFS::FileWriter *h;
|
||||
|
Loading…
Reference in New Issue
Block a user