From 5b47ac59816894e983a98d3da8b5415d569c6663 Mon Sep 17 00:00:00 2001 From: Sage Weil <sage@redhat.com> Date: Fri, 8 Sep 2017 18:05:29 -0400 Subject: [PATCH 1/5] os/bluestore: require that bluefs_alloc_size be multiple of min_alloc_size Signed-off-by: Sage Weil <sage@redhat.com> --- src/os/bluestore/BlueStore.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 868c0f6a4db..9c2a8c72fe7 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4555,6 +4555,13 @@ int BlueStore::_open_db(bool create) bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio + cct->_conf->bluestore_bluefs_gift_ratio); initial = MAX(initial, cct->_conf->bluestore_bluefs_min); + if (cct->_conf->bluefs_alloc_size % min_alloc_size) { + derr << __func__ << " bluefs_alloc_size 0x" << std::hex + << cct->_conf->bluefs_alloc_size << " is not a multiple of " + << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl; + r = -EINVAL; + goto free_bluefs; + } // align to bluefs's alloc_size initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size); // put bluefs in the middle of the device in case it is an HDD From 3efde01586776b23bbac1e663ae5baf6500acee4 Mon Sep 17 00:00:00 2001 From: Sage Weil <sage@redhat.com> Date: Fri, 8 Sep 2017 18:06:05 -0400 Subject: [PATCH 2/5] os/bluestore: mkfs: choose min_alloc_size earlier Signed-off-by: Sage Weil <sage@redhat.com> --- src/os/bluestore/BlueStore.cc | 44 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 9c2a8c72fe7..0a73f61f84e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5167,6 +5167,28 @@ int BlueStore::mkfs() if (r < 0) goto out_close_fsid; + // choose min_alloc_size + if (cct->_conf->bluestore_min_alloc_size) { + min_alloc_size = cct->_conf->bluestore_min_alloc_size; + } else { + assert(bdev); + if (bdev->is_rotational()) { + min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd; + } else { + min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd; + } + } + + // make sure min_alloc_size is power of 2 aligned. + if (!ISP2(min_alloc_size)) { + derr << __func__ << " min_alloc_size 0x" + << std::hex << min_alloc_size << std::dec + << " is not power of 2 aligned!" + << dendl; + r = -EINVAL; + goto out_close_bdev; + } + r = _open_db(true); if (r < 0) goto out_close_bdev; @@ -5184,28 +5206,6 @@ int BlueStore::mkfs() t->set(PREFIX_SUPER, "blobid_max", bl); } - // choose min_alloc_size - if (cct->_conf->bluestore_min_alloc_size) { - min_alloc_size = cct->_conf->bluestore_min_alloc_size; - } else { - assert(bdev); - if (bdev->is_rotational()) { - min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd; - } else { - min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd; - } - } - - // make sure min_alloc_size is power of 2 aligned. - if (!ISP2(min_alloc_size)) { - derr << __func__ << " min_alloc_size 0x" - << std::hex << min_alloc_size << std::dec - << " is not power of 2 aligned!" - << dendl; - r = -EINVAL; - goto out_close_fm; - } - { bufferlist bl; ::encode((uint64_t)min_alloc_size, bl); From 52453d4ca223c8819f8e35f2c0b691803e74537f Mon Sep 17 00:00:00 2001 From: Sage Weil <sage@redhat.com> Date: Fri, 8 Sep 2017 18:07:38 -0400 Subject: [PATCH 3/5] os/bluestore/FreelistManager: create: accept min alloc size Accept a block size other than bdev_block_size. Let's call it, oh, I don't know, min_alloc_size. Signed-off-by: Sage Weil <sage@redhat.com> --- src/os/bluestore/BitmapFreelistManager.cc | 6 ++++-- src/os/bluestore/BitmapFreelistManager.h | 3 ++- src/os/bluestore/BlueStore.cc | 2 +- src/os/bluestore/FreelistManager.h | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc index 075d3eff6ef..e551acc5287 100644 --- a/src/os/bluestore/BitmapFreelistManager.cc +++ b/src/os/bluestore/BitmapFreelistManager.cc @@ -58,9 +58,11 @@ BitmapFreelistManager::BitmapFreelistManager(CephContext* cct, { } -int BitmapFreelistManager::create(uint64_t new_size, KeyValueDB::Transaction txn) +int BitmapFreelistManager::create(uint64_t new_size, uint64_t min_alloc_size, + KeyValueDB::Transaction txn) { - bytes_per_block = cct->_conf->bdev_block_size; + bytes_per_block = std::max(cct->_conf->bdev_block_size, + (int64_t)min_alloc_size); assert(ISP2(bytes_per_block)); size = P2ALIGN(new_size, bytes_per_block); blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key; diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h index 9ed39ff5653..cb10c63d98a 100644 --- a/src/os/bluestore/BitmapFreelistManager.h +++ b/src/os/bluestore/BitmapFreelistManager.h @@ -51,7 +51,8 @@ public: static void setup_merge_operator(KeyValueDB *db, string prefix); - int create(uint64_t size, KeyValueDB::Transaction txn) override; + int create(uint64_t size, uint64_t min_alloc_size, + KeyValueDB::Transaction txn) override; int init() override; void shutdown() override; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 0a73f61f84e..3a53d038f10 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4169,7 +4169,7 @@ int BlueStore::_open_fm(bool create) bl.append(freelist_type); t->set(PREFIX_SUPER, "freelist_type", bl); } - fm->create(bdev->get_size(), t); + fm->create(bdev->get_size(), cct->_conf->bdev_block_size, t); // allocate superblock reserved space. note that we do not mark // bluefs space as allocated in the freelist; we instead rely on diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h index 7f5ad4d79f9..8f7aacbf2e1 100644 --- a/src/os/bluestore/FreelistManager.h +++ b/src/os/bluestore/FreelistManager.h @@ -24,7 +24,8 @@ public: static void setup_merge_operators(KeyValueDB *db); - virtual int create(uint64_t size, KeyValueDB::Transaction txn) = 0; + virtual int create(uint64_t size, uint64_t min_alloc_size, + KeyValueDB::Transaction txn) = 0; virtual int init() = 0; virtual void shutdown() = 0; From 0c777efdcb2ee5a6322f0eb277e681d0f086e0b6 Mon Sep 17 00:00:00 2001 From: Sage Weil <sage@redhat.com> Date: Fri, 8 Sep 2017 18:08:07 -0400 Subject: [PATCH 4/5] os/bluestore: align bluefs_extents to min_alloc_size Signed-off-by: Sage Weil <sage@redhat.com> --- src/os/bluestore/BlueStore.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 3a53d038f10..1e24c7b521b 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4174,13 +4174,14 @@ int BlueStore::_open_fm(bool create) // allocate superblock reserved space. note that we do not mark // bluefs space as allocated in the freelist; we instead rely on // bluefs_extents. - fm->allocate(0, SUPER_RESERVED, t); + uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size), + min_alloc_size); + fm->allocate(0, reserved, t); - uint64_t reserved = 0; if (cct->_conf->bluestore_bluefs) { assert(bluefs_extents.num_intervals() == 1); interval_set<uint64_t>::iterator p = bluefs_extents.begin(); - reserved = p.get_start() + p.get_len(); + reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size); dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec << " for bluefs" << dendl; bufferlist bl; @@ -4188,8 +4189,6 @@ int BlueStore::_open_fm(bool create) t->set(PREFIX_SUPER, "bluefs_extents", bl); dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents << std::dec << dendl; - } else { - reserved = SUPER_RESERVED; } if (cct->_conf->bluestore_debug_prefill > 0) { From 6b8e4d512604095fb8a209229d4633ac19b499de Mon Sep 17 00:00:00 2001 From: Sage Weil <sage@redhat.com> Date: Fri, 8 Sep 2017 18:08:51 -0400 Subject: [PATCH 5/5] os/bluestore: use min_alloc_size for freelist resolution For HDD with min_alloc_size=64k, this is a 16x reduction in allocation metadata! Signed-off-by: Sage Weil <sage@redhat.com> --- src/os/bluestore/BlueStore.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 1e24c7b521b..7b9cfaab0b2 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4169,7 +4169,7 @@ int BlueStore::_open_fm(bool create) bl.append(freelist_type); t->set(PREFIX_SUPER, "freelist_type", bl); } - fm->create(bdev->get_size(), cct->_conf->bdev_block_size, t); + fm->create(bdev->get_size(), min_alloc_size, t); // allocate superblock reserved space. note that we do not mark // bluefs space as allocated in the freelist; we instead rely on @@ -5460,7 +5460,7 @@ int BlueStore::_fsck_check_extents( } bool already = false; apply( - e.offset, e.length, block_size, used_blocks, + e.offset, e.length, min_alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { if (bs.test(pos)) already = true; @@ -5562,9 +5562,9 @@ int BlueStore::fsck(bool deep) if (r < 0) goto out_scan; - used_blocks.resize(bdev->get_size() / block_size); + used_blocks.resize(bdev->get_size() / min_alloc_size); apply( - 0, SUPER_RESERVED, block_size, used_blocks, + 0, MAX(min_alloc_size, SUPER_RESERVED), min_alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { bs.set(pos); } @@ -5573,7 +5573,7 @@ int BlueStore::fsck(bool deep) if (bluefs) { for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) { apply( - e.get_start(), e.get_len(), block_size, used_blocks, + e.get_start(), e.get_len(), min_alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { bs.set(pos); } @@ -5972,7 +5972,7 @@ int BlueStore::fsck(bool deep) << " released 0x" << std::hex << wt.released << std::dec << dendl; for (auto e = wt.released.begin(); e != wt.released.end(); ++e) { apply( - e.get_start(), e.get_len(), block_size, used_blocks, + e.get_start(), e.get_len(), min_alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { bs.set(pos); } @@ -5987,7 +5987,7 @@ int BlueStore::fsck(bool deep) // know they are allocated. for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) { apply( - e.get_start(), e.get_len(), block_size, used_blocks, + e.get_start(), e.get_len(), min_alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { bs.reset(pos); } @@ -5998,7 +5998,7 @@ int BlueStore::fsck(bool deep) while (fm->enumerate_next(&offset, &length)) { bool intersects = false; apply( - offset, length, block_size, used_blocks, + offset, length, min_alloc_size, used_blocks, [&](uint64_t pos, mempool_dynamic_bitset &bs) { if (bs.test(pos)) { intersects = true; @@ -6027,8 +6027,8 @@ int BlueStore::fsck(bool deep) size_t next = used_blocks.find_next(cur); if (next != cur + 1) { derr << __func__ << " error: leaked extent 0x" << std::hex - << ((uint64_t)start * block_size) << "~" - << ((cur + 1 - start) * block_size) << std::dec + << ((uint64_t)start * min_alloc_size) << "~" + << ((cur + 1 - start) * min_alloc_size) << std::dec << dendl; start = next; break;