From 5b47ac59816894e983a98d3da8b5415d569c6663 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Fri, 8 Sep 2017 18:05:29 -0400
Subject: [PATCH 1/5] os/bluestore: require that bluefs_alloc_size be multiple
 of min_alloc_size

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/os/bluestore/BlueStore.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 868c0f6a4db..9c2a8c72fe7 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -4555,6 +4555,13 @@ int BlueStore::_open_db(bool create)
 	bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
 			    cct->_conf->bluestore_bluefs_gift_ratio);
       initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
+      if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
+	derr << __func__ << " bluefs_alloc_size 0x" << std::hex
+	     << cct->_conf->bluefs_alloc_size << " is not a multiple of "
+	     << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+	r = -EINVAL;
+	goto free_bluefs;
+      }
       // align to bluefs's alloc_size
       initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
       // put bluefs in the middle of the device in case it is an HDD

From 3efde01586776b23bbac1e663ae5baf6500acee4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Fri, 8 Sep 2017 18:06:05 -0400
Subject: [PATCH 2/5] os/bluestore: mkfs: choose min_alloc_size earlier

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/os/bluestore/BlueStore.cc | 44 +++++++++++++++++------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 9c2a8c72fe7..0a73f61f84e 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -5167,6 +5167,28 @@ int BlueStore::mkfs()
   if (r < 0)
     goto out_close_fsid;
 
+  // choose min_alloc_size
+  if (cct->_conf->bluestore_min_alloc_size) {
+    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+  } else {
+    assert(bdev);
+    if (bdev->is_rotational()) {
+      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+    } else {
+      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+    }
+  }
+
+  // make sure min_alloc_size is power of 2 aligned.
+  if (!ISP2(min_alloc_size)) {
+    derr << __func__ << " min_alloc_size 0x"
+	 << std::hex << min_alloc_size << std::dec
+	 << " is not power of 2 aligned!"
+	 << dendl;
+    r = -EINVAL;
+    goto out_close_bdev;
+  }
+
   r = _open_db(true);
   if (r < 0)
     goto out_close_bdev;
@@ -5184,28 +5206,6 @@ int BlueStore::mkfs()
       t->set(PREFIX_SUPER, "blobid_max", bl);
     }
 
-    // choose min_alloc_size
-    if (cct->_conf->bluestore_min_alloc_size) {
-      min_alloc_size = cct->_conf->bluestore_min_alloc_size;
-    } else {
-      assert(bdev);
-      if (bdev->is_rotational()) {
-	min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
-      } else {
-	min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
-      }
-    }
-
-    // make sure min_alloc_size is power of 2 aligned.
-    if (!ISP2(min_alloc_size)) {
-      derr << __func__ << " min_alloc_size 0x"
-           << std::hex << min_alloc_size << std::dec
-           << " is not power of 2 aligned!"
-           << dendl;
-      r = -EINVAL;
-      goto out_close_fm;
-    }
-
     {
       bufferlist bl;
       ::encode((uint64_t)min_alloc_size, bl);

From 52453d4ca223c8819f8e35f2c0b691803e74537f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Fri, 8 Sep 2017 18:07:38 -0400
Subject: [PATCH 3/5] os/bluestore/FreelistManager: create: accept min alloc
 size

Accept a block size other than bdev_block_size.  Let's call it, oh, I don't
know, min_alloc_size.

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/os/bluestore/BitmapFreelistManager.cc | 6 ++++--
 src/os/bluestore/BitmapFreelistManager.h  | 3 ++-
 src/os/bluestore/BlueStore.cc             | 2 +-
 src/os/bluestore/FreelistManager.h        | 3 ++-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc
index 075d3eff6ef..e551acc5287 100644
--- a/src/os/bluestore/BitmapFreelistManager.cc
+++ b/src/os/bluestore/BitmapFreelistManager.cc
@@ -58,9 +58,11 @@ BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
 {
 }
 
-int BitmapFreelistManager::create(uint64_t new_size, KeyValueDB::Transaction txn)
+int BitmapFreelistManager::create(uint64_t new_size, uint64_t min_alloc_size,
+				  KeyValueDB::Transaction txn)
 {
-  bytes_per_block = cct->_conf->bdev_block_size;
+  bytes_per_block = std::max(cct->_conf->bdev_block_size,
+			     (int64_t)min_alloc_size);
   assert(ISP2(bytes_per_block));
   size = P2ALIGN(new_size, bytes_per_block);
   blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h
index 9ed39ff5653..cb10c63d98a 100644
--- a/src/os/bluestore/BitmapFreelistManager.h
+++ b/src/os/bluestore/BitmapFreelistManager.h
@@ -51,7 +51,8 @@ public:
 
   static void setup_merge_operator(KeyValueDB *db, string prefix);
 
-  int create(uint64_t size, KeyValueDB::Transaction txn) override;
+  int create(uint64_t size, uint64_t min_alloc_size,
+	     KeyValueDB::Transaction txn) override;
 
   int init() override;
   void shutdown() override;
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 0a73f61f84e..3a53d038f10 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -4169,7 +4169,7 @@ int BlueStore::_open_fm(bool create)
       bl.append(freelist_type);
       t->set(PREFIX_SUPER, "freelist_type", bl);
     }
-    fm->create(bdev->get_size(), t);
+    fm->create(bdev->get_size(), cct->_conf->bdev_block_size, t);
 
     // allocate superblock reserved space.  note that we do not mark
     // bluefs space as allocated in the freelist; we instead rely on
diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h
index 7f5ad4d79f9..8f7aacbf2e1 100644
--- a/src/os/bluestore/FreelistManager.h
+++ b/src/os/bluestore/FreelistManager.h
@@ -24,7 +24,8 @@ public:
 
   static void setup_merge_operators(KeyValueDB *db);
 
-  virtual int create(uint64_t size, KeyValueDB::Transaction txn) = 0;
+  virtual int create(uint64_t size, uint64_t min_alloc_size,
+		     KeyValueDB::Transaction txn) = 0;
 
   virtual int init() = 0;
   virtual void shutdown() = 0;

From 0c777efdcb2ee5a6322f0eb277e681d0f086e0b6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Fri, 8 Sep 2017 18:08:07 -0400
Subject: [PATCH 4/5] os/bluestore: align bluefs_extents to min_alloc_size

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/os/bluestore/BlueStore.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 3a53d038f10..1e24c7b521b 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -4174,13 +4174,14 @@ int BlueStore::_open_fm(bool create)
     // allocate superblock reserved space.  note that we do not mark
     // bluefs space as allocated in the freelist; we instead rely on
     // bluefs_extents.
-    fm->allocate(0, SUPER_RESERVED, t);
+    uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
+				    min_alloc_size);
+    fm->allocate(0, reserved, t);
 
-    uint64_t reserved = 0;
     if (cct->_conf->bluestore_bluefs) {
       assert(bluefs_extents.num_intervals() == 1);
       interval_set<uint64_t>::iterator p = bluefs_extents.begin();
-      reserved = p.get_start() + p.get_len();
+      reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
       dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
 	       << " for bluefs" << dendl;
       bufferlist bl;
@@ -4188,8 +4189,6 @@ int BlueStore::_open_fm(bool create)
       t->set(PREFIX_SUPER, "bluefs_extents", bl);
       dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
 	       << std::dec << dendl;
-    } else {
-      reserved = SUPER_RESERVED;
     }
 
     if (cct->_conf->bluestore_debug_prefill > 0) {

From 6b8e4d512604095fb8a209229d4633ac19b499de Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Fri, 8 Sep 2017 18:08:51 -0400
Subject: [PATCH 5/5] os/bluestore: use min_alloc_size for freelist resolution

For HDD with min_alloc_size=64k, this is a 16x reduction in allocation
metadata!

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/os/bluestore/BlueStore.cc | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 1e24c7b521b..7b9cfaab0b2 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -4169,7 +4169,7 @@ int BlueStore::_open_fm(bool create)
       bl.append(freelist_type);
       t->set(PREFIX_SUPER, "freelist_type", bl);
     }
-    fm->create(bdev->get_size(), cct->_conf->bdev_block_size, t);
+    fm->create(bdev->get_size(), min_alloc_size, t);
 
     // allocate superblock reserved space.  note that we do not mark
     // bluefs space as allocated in the freelist; we instead rely on
@@ -5460,7 +5460,7 @@ int BlueStore::_fsck_check_extents(
     }
     bool already = false;
     apply(
-      e.offset, e.length, block_size, used_blocks,
+      e.offset, e.length, min_alloc_size, used_blocks,
       [&](uint64_t pos, mempool_dynamic_bitset &bs) {
 	if (bs.test(pos))
 	  already = true;
@@ -5562,9 +5562,9 @@ int BlueStore::fsck(bool deep)
   if (r < 0)
     goto out_scan;
 
-  used_blocks.resize(bdev->get_size() / block_size);
+  used_blocks.resize(bdev->get_size() / min_alloc_size);
   apply(
-    0, SUPER_RESERVED, block_size, used_blocks,
+    0, MAX(min_alloc_size, SUPER_RESERVED), min_alloc_size, used_blocks,
     [&](uint64_t pos, mempool_dynamic_bitset &bs) {
       bs.set(pos);
     }
@@ -5573,7 +5573,7 @@ int BlueStore::fsck(bool deep)
   if (bluefs) {
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
       apply(
-        e.get_start(), e.get_len(), block_size, used_blocks,
+        e.get_start(), e.get_len(), min_alloc_size, used_blocks,
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
           bs.set(pos);
         }
@@ -5972,7 +5972,7 @@ int BlueStore::fsck(bool deep)
 	       << " released 0x" << std::hex << wt.released << std::dec << dendl;
       for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
         apply(
-          e.get_start(), e.get_len(), block_size, used_blocks,
+          e.get_start(), e.get_len(), min_alloc_size, used_blocks,
           [&](uint64_t pos, mempool_dynamic_bitset &bs) {
             bs.set(pos);
           }
@@ -5987,7 +5987,7 @@ int BlueStore::fsck(bool deep)
     // know they are allocated.
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
       apply(
-        e.get_start(), e.get_len(), block_size, used_blocks,
+        e.get_start(), e.get_len(), min_alloc_size, used_blocks,
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
 	  bs.reset(pos);
         }
@@ -5998,7 +5998,7 @@ int BlueStore::fsck(bool deep)
     while (fm->enumerate_next(&offset, &length)) {
       bool intersects = false;
       apply(
-        offset, length, block_size, used_blocks,
+        offset, length, min_alloc_size, used_blocks,
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
           if (bs.test(pos)) {
             intersects = true;
@@ -6027,8 +6027,8 @@ int BlueStore::fsck(bool deep)
 	  size_t next = used_blocks.find_next(cur);
 	  if (next != cur + 1) {
 	    derr << __func__ << " error: leaked extent 0x" << std::hex
-		 << ((uint64_t)start * block_size) << "~"
-		 << ((cur + 1 - start) * block_size) << std::dec
+		 << ((uint64_t)start * min_alloc_size) << "~"
+		 << ((cur + 1 - start) * min_alloc_size) << std::dec
 		 << dendl;
 	    start = next;
 	    break;