Merge pull request #34588 from ifed01/wip-ifed-lower-spinner-mas

os/bluestore: set bluestore_min_alloc_size to 4K for spinners

Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Reviewed-by: Mark Nelson <mnelson@redhat.com>
Reviewed-by: Neha Ojha <nojha@redhat.com>
This commit is contained in:
Kefu Chai 2020-04-27 11:34:14 +08:00 committed by GitHub
commit a8733598ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 83 additions and 28 deletions

View File

@ -12,8 +12,8 @@ overrides:
debug bluefs: 20
debug rocksdb: 10
bluestore fsck on mount: true
bluestore allocator: avl
bluefs allocator: avl
bluestore allocator: hybrid
bluefs allocator: hybrid
# lower the full ratios since we can fill up a 100gb osd so quickly
mon osd full ratio: .9
mon osd backfillfull_ratio: .85

View File

@ -4019,7 +4019,7 @@ std::vector<Option> get_global_options() {
.set_description(""),
Option("bluefs_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
.set_default("bitmap")
.set_default("hybrid")
.set_enum_allowed({"bitmap", "stupid", "avl", "hybrid"})
.set_description(""),
@ -4183,7 +4183,7 @@ std::vector<Option> get_global_options() {
.set_long_description("A smaller allocation size generally means less data is read and then rewritten when a copy-on-write operation is triggered (e.g., when writing to something that was recently snapshotted). Similarly, less data is journaled before performing an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore journal). Larger values of min_alloc_size reduce the amount of metadata required to describe the on-disk layout and reduce overall fragmentation."),
Option("bluestore_min_alloc_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(64_K)
.set_default(4_K)
.set_flag(Option::FLAG_CREATE)
.set_description("Default min_alloc_size value for rotational media")
.add_see_also("bluestore_min_alloc_size"),
@ -4205,7 +4205,7 @@ std::vector<Option> get_global_options() {
.set_description("Writes smaller than this size will be written to the journal and then asynchronously written to the device. This can be beneficial when using rotational media where seeks are expensive, and is helpful both with and without solid state journal/wal devices."),
Option("bluestore_prefer_deferred_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(65536)
.set_default(64_K)
.set_flag(Option::FLAG_RUNTIME)
.set_description("Default bluestore_prefer_deferred_size for rotational media")
.add_see_also("bluestore_prefer_deferred_size"),
@ -4237,7 +4237,7 @@ std::vector<Option> get_global_options() {
.set_long_description("Chunks larger than this are broken into smaller chunks before being compressed"),
Option("bluestore_compression_min_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(128_K)
.set_default(8_K)
.set_flag(Option::FLAG_RUNTIME)
.set_description("Default value of bluestore_compression_min_blob_size for rotational media")
.add_see_also("bluestore_compression_min_blob_size"),
@ -4255,7 +4255,7 @@ std::vector<Option> get_global_options() {
.set_long_description("Chunks larger than this are broken into smaller chunks before being compressed"),
Option("bluestore_compression_max_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(512_K)
.set_default(64_K)
.set_flag(Option::FLAG_RUNTIME)
.set_description("Default value of bluestore_compression_max_blob_size for rotational media")
.add_see_also("bluestore_compression_max_blob_size"),
@ -4283,7 +4283,7 @@ std::vector<Option> get_global_options() {
.set_long_description("Bluestore blobs are collections of extents (ie on-disk data) originating from one or more objects. Blobs can be compressed, typically have checksum data, may be overwritten, may be shared (with an extent ref map), or split. This setting controls the maximum size a blob is allowed to be."),
Option("bluestore_max_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_DEV)
.set_default(512_K)
.set_default(64_K)
.set_flag(Option::FLAG_RUNTIME)
.set_description("")
.add_see_also("bluestore_max_blob_size"),
@ -4387,7 +4387,7 @@ std::vector<Option> get_global_options() {
.set_description("Key value database to use for bluestore"),
Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("bitmap")
.set_default("hybrid")
.set_enum_allowed({"bitmap", "stupid", "avl", "hybrid"})
.set_description("Allocator policy")
.set_long_description("Allocator to use for bluestore. Stupid should only be used for testing."),

View File

@ -1318,6 +1318,8 @@ TEST_P(StoreTest, SimpleObjectTest) {
TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) {
if(string(GetParam()) != "bluestore")
return;
SetVal(g_conf(), "bluestore_max_blob_size", "524288");
SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
g_conf().apply_changes(nullptr);
StartDeferred(65536);
@ -3768,6 +3770,7 @@ public:
unsigned in_flight;
map<ghobject_t, Object> contents;
set<ghobject_t> available_objects;
set<ghobject_t>::iterator next_available_object;
set<ghobject_t> in_flight_objects;
ObjectGenerator *object_gen;
gen_type *rng;
@ -3892,8 +3895,9 @@ public:
unsigned max_write,
unsigned alignment)
: cid(cid), write_alignment(alignment), max_object_len(max_size),
max_write_len(max_write), in_flight(0), object_gen(gen),
rng(rng), store(store) {}
max_write_len(max_write), in_flight(0),
next_available_object(available_objects.end()),
object_gen(gen), rng(rng), store(store) {}
int init() {
ObjectStore::Transaction t;
@ -3902,17 +3906,19 @@ public:
return queue_transaction(store, ch, std::move(t));
}
void shutdown() {
ghobject_t next;
while (1) {
vector<ghobject_t> objects;
int r = store->collection_list(ch, ghobject_t(), ghobject_t::get_max(),
10, &objects, 0);
int r = store->collection_list(ch, next, ghobject_t::get_max(),
10, &objects, &next);
ceph_assert(r >= 0);
if (objects.empty())
break;
if (objects.size() == 0)
break;
ObjectStore::Transaction t;
std::map<std::string, ceph::buffer::list> attrset;
for (vector<ghobject_t>::iterator p = objects.begin();
p != objects.end(); ++p) {
t.remove(cid, *p);
p != objects.end(); ++p) {
t.remove(cid, *p);
}
queue_transaction(store, ch, std::move(t));
}
@ -3936,6 +3942,20 @@ public:
return ret;
}
// Returns the next object to operate on, round-robining over
// available_objects. Blocks on `cond` until there is both spare
// in-flight capacity and at least one available object.
// `locker` must already hold the workload-state mutex; cond.wait()
// releases it while sleeping and reacquires it before returning.
ghobject_t get_next_object(std::unique_lock<ceph::mutex>& locker) {
cond.wait(locker, [this] {
return in_flight < max_in_flight && !available_objects.empty();
});
// Wrap the cursor back to the beginning once it reaches the end,
// giving a cyclic traversal of the set.
// NOTE(review): next_available_object is a set iterator kept across
// calls; presumably callers that erase from available_objects never
// erase the element this cursor points at, otherwise it would
// dangle — confirm against erase sites (e.g. set_fixed_attrs).
if (next_available_object == available_objects.end()) {
next_available_object = available_objects.begin();
}
ghobject_t ret = *next_available_object;
++next_available_object;
return ret;
}
void wait_for_ready(std::unique_lock<ceph::mutex>& locker) {
cond.wait(locker, [this] { return in_flight < max_in_flight; });
}
@ -4392,6 +4412,35 @@ public:
return status;
}
// Queues a transaction that sets `entries` xattrs, each with a
// randomly-filled name of `key_size` bytes and value of `val_size`
// bytes, on one object chosen by get_next_object(). The object is
// moved from available_objects to in_flight_objects until the
// transaction's on_applied callback fires.
// Returns -ENOENT when no object can be picked (can_unlink() false),
// otherwise the result of queue_transaction().
int set_fixed_attrs(size_t entries, size_t key_size, size_t val_size) {
std::unique_lock locker{ lock };
EnterExit ee("setattrs");
if (!can_unlink())
return -ENOENT;
// Both of these may block on `cond`; get_next_object() re-checks
// the in-flight limit itself, so wait_for_ready() is belt-and-braces.
wait_for_ready(locker);
ghobject_t obj = get_next_object(locker);
available_objects.erase(obj);
ObjectStore::Transaction t;
map<string, bufferlist> attrs;
// NOTE(review): `keys` is never used in this function — dead local.
set<string> keys;
while (entries--) {
bufferlist name, value;
filled_byte_array(value, val_size);
filled_byte_array(name, key_size);
// bufferlist::c_str() flattens the buffer; the result is used as
// the map key. Duplicate random names would silently collapse
// into one attr, so the final count may be < `entries` — presumably
// acceptable for a synthetic workload.
attrs[name.c_str()] = value;
// Mirror into the in-memory model so later reads can verify.
contents[obj].attrs[name.c_str()] = value;
}
t.setattrs(cid, obj, attrs);
++in_flight;
in_flight_objects.insert(obj);
// Completion callback decrements in_flight and returns the object
// to the available pool (see C_SyntheticOnReadable).
t.register_on_applied(new C_SyntheticOnReadable(this, obj));
int status = store->queue_transaction(ch, std::move(t));
return status;
}
void getattrs() {
EnterExit ee("getattrs");
ghobject_t obj;
@ -6680,8 +6729,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) {
return;
size_t block_size = 4096;
// this will enable continuous allocations
SetVal(g_conf(), "bluestore_allocator", "avl");
StartDeferred(block_size);
SetVal(g_conf(), "bluestore_max_blob_size", "131072");
SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
@ -7117,8 +7164,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredDifferentChunks) {
size_t alloc_size = 4096;
size_t large_object_size = 1 * 1024 * 1024;
// this will enable continuous allocations
SetVal(g_conf(), "bluestore_allocator", "avl");
StartDeferred(alloc_size);
SetVal(g_conf(), "bluestore_max_blob_size", "131072");
SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
@ -8709,22 +8754,23 @@ void doManySetAttr(ObjectStore* store,
gen_type rng(time(NULL));
coll_t cid(spg_t(pg_t(0, 447), shard_id_t::NO_SHARD));
SyntheticWorkloadState test_obj(store, &gen, &rng, cid, 40 * 1024, 4 * 1024, 0);
SyntheticWorkloadState test_obj(store, &gen, &rng, cid, 0, 0, 0);
test_obj.init();
for (int i = 0; i < 1500; ++i) {
size_t object_count = 256;
for (size_t i = 0; i < object_count; ++i) {
if (!(i % 10)) cerr << "seeding object " << i << std::endl;
test_obj.touch();
}
for (int i = 0; i < 10000; ++i) {
for (size_t i = 0; i < object_count; ++i) {
if (!(i % 100)) {
cerr << "Op " << i << std::endl;
test_obj.print_internal_state();
}
boost::uniform_int<> true_false(0, 99);
test_obj.setattrs();
test_obj.set_fixed_attrs(1024, 64, 4096); // 1024 attributes, 64 bytes name and 4K value
}
test_obj.wait_for_done();
std::cout << "done" << std::endl;
AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
ceph_assert(admin_socket);
@ -8761,7 +8807,9 @@ TEST_P(StoreTestSpecificAUSize, SpilloverTest) {
const PerfCounters* logger = bstore->get_bluefs_perf_counters();
//experimentally it was discovered that this case results in 400+MB spillover
//using lower 300MB threshold just to be safe enough
ASSERT_GE(logger->get(l_bluefs_slow_used_bytes), 300 * 1024 * 1024);
std::cout << "db_used:" << logger->get(l_bluefs_db_used_bytes) << std::endl;
std::cout << "slow_used:" << logger->get(l_bluefs_slow_used_bytes) << std::endl;
ASSERT_GE(logger->get(l_bluefs_slow_used_bytes), 16 * 1024 * 1024);
}
);

View File

@ -82,6 +82,7 @@ TEST(BlueFS, mkfs_mount) {
TEST(BlueFS, mkfs_mount_duplicate_gift) {
uint64_t size = 1048576 * 128;
TempBdev bdev{ size };
bluefs_extent_t dup_ext;
{
BlueFS fs(g_ceph_context);
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
@ -98,6 +99,9 @@ TEST(BlueFS, mkfs_mount_duplicate_gift) {
h->append("bar", 3);
h->append("baz", 3);
fs.fsync(h);
ceph_assert(h->file->fnode.extents.size() > 0);
dup_ext = h->file->fnode.extents[0];
ceph_assert(dup_ext.bdev == BlueFS::BDEV_DB);
fs.close_writer(h);
}
@ -109,7 +113,10 @@ TEST(BlueFS, mkfs_mount_duplicate_gift) {
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
ASSERT_EQ(0, fs.mount());
// free allocation presumably allocated for file1
fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, 5 * 1048576, 1048576);
std::cout << "duplicate extent: " << std::hex
<< dup_ext.offset << "~" << dup_ext.length
<< std::dec << std::endl;
fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, dup_ext.offset, dup_ext.length);
{
// overwrite file1 with file2
BlueFS::FileWriter *h;