From e6046074bb4bd7dca0100265a62b2b2706bb0f66 Mon Sep 17 00:00:00 2001 From: Abutalib Aghayev Date: Tue, 30 Jun 2020 13:00:24 -0400 Subject: [PATCH] os/bluestore: Introduce zoned freelist manager. Introduced ZonedFreelistManager, which tracks the write pointer and the number of dead bytes per zone. Signed-off-by: Abutalib Aghayev --- src/os/CMakeLists.txt | 2 + src/os/bluestore/Allocator.h | 6 +- src/os/bluestore/BlueStore.cc | 38 +-- src/os/bluestore/BlueStore.h | 14 + src/os/bluestore/FreelistManager.cc | 25 +- src/os/bluestore/FreelistManager.h | 7 +- src/os/bluestore/ZonedAllocator.cc | 96 ++++--- src/os/bluestore/ZonedAllocator.h | 36 ++- src/os/bluestore/ZonedFreelistManager.cc | 315 +++++++++++++++++++++++ src/os/bluestore/ZonedFreelistManager.h | 106 ++++++++ src/os/bluestore/zoned_types.cc | 24 ++ src/os/bluestore/zoned_types.h | 44 ++++ 12 files changed, 622 insertions(+), 91 deletions(-) create mode 100644 src/os/bluestore/ZonedFreelistManager.cc create mode 100644 src/os/bluestore/ZonedFreelistManager.h create mode 100644 src/os/bluestore/zoned_types.cc create mode 100644 src/os/bluestore/zoned_types.h diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index 2611a5a7d1a..678c2bfd5d6 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -39,6 +39,8 @@ endif(WITH_BLUESTORE) if(HAVE_LIBZBC) list(APPEND libos_srcs + bluestore/zoned_types.cc + bluestore/ZonedFreelistManager.cc bluestore/ZonedAllocator.cc) endif() diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h index 9ea1e3f2d37..2104c2cc113 100644 --- a/src/os/bluestore/Allocator.h +++ b/src/os/bluestore/Allocator.h @@ -12,10 +12,11 @@ #ifndef CEPH_OS_BLUESTORE_ALLOCATOR_H #define CEPH_OS_BLUESTORE_ALLOCATOR_H +#include #include #include "include/ceph_assert.h" -#include "os/bluestore/bluestore_types.h" -#include +#include "bluestore_types.h" +#include "zoned_types.h" class Allocator { public: @@ -48,6 +49,7 @@ public: virtual void dump() = 0; virtual void dump(std::function notify) = 0; + virtual void set_zone_states(std::vector &&_zone_states) {} virtual void init_add_free(uint64_t offset, uint64_t length) = 0; virtual void init_rm_free(uint64_t offset, uint64_t length) = 0; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 3cf0ba95e6d..409088ebf3c 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -117,6 +117,8 @@ const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist) const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager) const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t +const string PREFIX_ZONED_META = "Z"; // (see ZonedFreelistManager) +const string PREFIX_ZONED_INFO = "z"; // (see ZonedFreelistManager) const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs"; @@ -4857,6 +4859,10 @@ int BlueStore::_open_bdev(bool create) if (r < 0) { goto fail_close; } + + if (bdev->is_smr()) { + freelist_type = "zoned"; + } return 0; fail_close: @@ -4910,7 +4916,13 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only) // being able to allocate in units less than bdev block size // seems to be a bad idea. 
ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size); - fm->create(bdev->get_size(), (int64_t)min_alloc_size, t); + + uint64_t alloc_size = min_alloc_size; + if (bdev->is_smr()) { + alloc_size = _piggyback_zoned_device_parameters_onto(alloc_size); + } + + fm->create(bdev->get_size(), alloc_size, t); // allocate superblock reserved space. note that we do not mark // bluefs space as allocated in the freelist; we instead rely on @@ -5041,7 +5053,9 @@ int BlueStore::_open_alloc() << dendl; } + uint64_t alloc_size = min_alloc_size; if (bdev->is_smr()) { + alloc_size = _piggyback_zoned_device_parameters_onto(alloc_size); if (cct->_conf->bluestore_allocator != "zoned") { dout(1) << __func__ << " The drive is HM-SMR but " << cct->_conf->bluestore_allocator << " allocator is specified. " @@ -5067,25 +5081,11 @@ int BlueStore::_open_alloc() << "Please set to 0." << dendl; return -EINVAL; } - - // For now, to avoid interface changes we piggyback zone_size (in MiB) and - // the first sequential zone number onto min_alloc_size and pass it to - // Allocator::create. - uint64_t zone_size = bdev->get_zone_size(); - uint64_t zone_size_mb = zone_size / (1024 * 1024); - uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size; - - min_alloc_size |= (zone_size_mb << 32); - min_alloc_size |= (first_seq_zone << 48); } alloc = Allocator::create(cct, cct->_conf->bluestore_allocator, bdev->get_size(), - min_alloc_size, "block"); - - if (bdev->is_smr()) { - min_alloc_size &= 0x00000000ffffffff; - } + alloc_size, "block"); if (!alloc) { lderr(cct) << __func__ << " Allocator::unknown alloc type " @@ -5094,6 +5094,10 @@ int BlueStore::_open_alloc() return -EINVAL; } + if (bdev->is_smr()) { + alloc->set_zone_states(fm->get_zone_states(db)); + } + uint64_t num = 0, bytes = 0; dout(1) << __func__ << " opening allocation metadata" << dendl; @@ -5845,7 +5849,7 @@ int BlueStore::_prepare_db_environment(bool create, bool read_only, return -EIO; } - FreelistManager::setup_merge_operators(db); + FreelistManager::setup_merge_operators(db, freelist_type); db->set_merge_operator(PREFIX_STAT, merge_op); db->set_cache_size(cache_kv_ratio * cache_size); return 0; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 91503e4c4d4..4d1a4269fe1 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2339,6 +2339,20 @@ private: int _setup_block_symlink_or_file(std::string name, std::string path, uint64_t size, bool create); + // Functions related to zoned storage. + + // For now, to avoid interface changes we piggyback zone_size (in MiB) and the + // first sequential zone number onto min_alloc_size and pass it to functions + // Allocator::create and FreelistManager::create. 
+ uint64_t _piggyback_zoned_device_parameters_onto(uint64_t min_alloc_size) { + uint64_t zone_size = bdev->get_zone_size(); + uint64_t zone_size_mb = zone_size / (1024 * 1024); + uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size; + min_alloc_size |= (zone_size_mb << 32); + min_alloc_size |= (first_seq_zone << 48); + return min_alloc_size; + } + public: utime_t get_deferred_last_submitted() { std::lock_guard l(deferred_lock); diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc index ba34c0e4de4..652cb9251ba 100644 --- a/src/os/bluestore/FreelistManager.cc +++ b/src/os/bluestore/FreelistManager.cc @@ -3,6 +3,9 @@ #include "FreelistManager.h" #include "BitmapFreelistManager.h" +#ifdef HAVE_LIBZBC +#include "ZonedFreelistManager.h" +#endif FreelistManager *FreelistManager::create( CephContext* cct, @@ -16,10 +19,28 @@ FreelistManager *FreelistManager::create( ceph_assert(prefix == "B"); if (type == "bitmap") return new BitmapFreelistManager(cct, "B", "b"); + +#ifdef HAVE_LIBZBC + // With zoned drives there is only one FreelistManager implementation that we + // can use, and we also know if a drive is zoned right after opening it + // (BlueStore::_open_bdev). Hence, we set freelist_type to "zoned" whenever + // we open the device and it turns out to be zoned. We ignore |prefix| + // passed to create and use the prefixes defined for zoned devices at the top + // of BlueStore.cc. + if (type == "zoned") + return new ZonedFreelistManager(cct, "Z", "z"); +#endif + return NULL; } -void FreelistManager::setup_merge_operators(KeyValueDB *db) +void FreelistManager::setup_merge_operators(KeyValueDB *db, + const std::string& type) { - BitmapFreelistManager::setup_merge_operator(db, "b"); +#ifdef HAVE_LIBZBC + if (type == "zoned") + ZonedFreelistManager::setup_merge_operator(db, "z"); + else +#endif + BitmapFreelistManager::setup_merge_operator(db, "b"); } diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h index 911a07aa272..1aaff89eacb 100644 --- a/src/os/bluestore/FreelistManager.h +++ b/src/os/bluestore/FreelistManager.h @@ -10,6 +10,7 @@ #include #include "kv/KeyValueDB.h" #include "bluestore_types.h" +#include "zoned_types.h" class FreelistManager { public: @@ -22,7 +23,7 @@ public: std::string type, std::string prefix); - static void setup_merge_operators(KeyValueDB *db); + static void setup_merge_operators(KeyValueDB *db, const std::string &type); virtual int create(uint64_t size, uint64_t granularity, KeyValueDB::Transaction txn) = 0; @@ -50,6 +51,10 @@ public: virtual void get_meta(uint64_t target_size, std::vector<std::pair<std::string, std::string>>*) const = 0; + + virtual std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const { + return {}; + } }; diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc index 7386f4ff06e..3a80593f40c 100644 --- a/src/os/bluestore/ZonedAllocator.cc +++ b/src/os/bluestore/ZonedAllocator.cc @@ -10,6 +10,7 @@ #include "ZonedAllocator.h" #include "bluestore_types.h" +#include "zoned_types.h" #include "common/debug.h" #define dout_context cct @@ -25,15 +26,17 @@ ZonedAllocator::ZonedAllocator(CephContext* cct, cct(cct), num_free(0), size(size), + // To avoid interface changes, we piggyback zone size and the first + // sequential zone number onto the upper 32 bits of the 64-bit |block_size|. + // The lower 32 bits of |block_size| hold the actual block size. 
block_size((block_size & 0x00000000ffffffff)), zone_size(((block_size & 0x0000ffff00000000) >> 32) * 1024 * 1024), - starting_zone((block_size & 0xffff000000000000) >> 48), - nr_zones(size / zone_size), - write_pointers(nr_zones) { + starting_zone_num((block_size & 0xffff000000000000) >> 48), + num_zones(size / zone_size) { ldout(cct, 10) << __func__ << " size 0x" << std::hex << size << " zone size 0x" << zone_size << std::dec - << " number of zones " << nr_zones - << " first sequential zone " << starting_zone + << " number of zones " << num_zones + << " first sequential zone " << starting_zone_num << dendl; ceph_assert(size % zone_size == 0); } @@ -53,37 +56,40 @@ int64_t ZonedAllocator::allocate( ldout(cct, 10) << __func__ << " trying to allocate " << std::hex << want_size << dendl; - uint64_t zone = starting_zone; - for ( ; zone < nr_zones; ++zone) { - if (fits(want_size, zone)) + uint64_t zone_num = starting_zone_num; + for ( ; zone_num < num_zones; ++zone_num) { + if (fits(want_size, zone_num)) { break; - ldout(cct, 10) << __func__ << " skipping zone " << zone + } + ldout(cct, 10) << __func__ << " skipping zone " << zone_num << " because there is not enough space: " << " want_size = " << want_size - << " available = " << zone_free_space(zone) << dendl; + << " available = " << get_remaining_space(zone_num) + << dendl; } - if (zone == nr_zones) { + if (zone_num == num_zones) { ldout(cct, 10) << __func__ << " failed to allocate" << dendl; return -ENOSPC; } - uint64_t offset = zone_offset(zone); - ldout(cct, 10) << __func__ << " advancing zone " << zone - << " write pointer from " << std::hex << offset - << " to " << offset + want_size << dendl; - advance_wp(zone, want_size); + uint64_t offset = get_offset(zone_num); - if (zone_free_space(zone) == 0) { - starting_zone = zone + 1; + ldout(cct, 10) << __func__ << " advancing zone " << std::hex + << zone_num << " write pointer from " << offset + << " to " << offset + want_size << dendl; + + advance_write_pointer(zone_num, want_size); + if (get_remaining_space(zone_num) == 0) { + starting_zone_num = zone_num + 1; } - ldout(cct, 10) << __func__ << " zone " << zone << " offset is now " - << std::hex << zone_wp(zone) << dendl; + ldout(cct, 10) << __func__ << std::hex << " zone " << zone_num + << " offset is now " << get_write_pointer(zone_num) << dendl; ldout(cct, 10) << __func__ << " allocated " << std::hex << want_size << " bytes at offset " << offset - << " located at zone " << zone + << " located at zone " << zone_num << " and zone offset " << offset % zone_size << dendl; extents->emplace_back(bluestore_pextent_t(offset, want_size)); @@ -108,31 +114,14 @@ void ZonedAllocator::dump(std::function zone_size - offset) { - length -= zone_size - offset; - for (++zone; length >= zone_size; length -= zone_size) { - ldout(cct, 30) << __func__ << " set zone 0x" << std::hex - << zone << " write pointer to 0x" << 0 << dendl; - write_pointers[zone++] = 0; - } - if (length > 0) { - ldout(cct, 20) << __func__ << " set zone 0x" << std::hex - << zone << " write pointer to 0x" << 0 << dendl; - write_pointers[zone] = length; - } - } } void ZonedAllocator::init_rm_free(uint64_t offset, uint64_t length) { @@ -143,23 +132,32 @@ void ZonedAllocator::init_rm_free(uint64_t offset, uint64_t length) { num_free -= length; ceph_assert(num_free >= 0); - uint64_t zone = offset / zone_size; - offset %= zone_size; - ceph_assert(write_pointers[zone] == offset); - write_pointers[zone] = zone_size; - ldout(cct, 10) << __func__ << " set zone 0x" << std::hex - << zone << " write 
pointer to 0x" << zone_size << dendl; + uint64_t zone_num = offset / zone_size; + uint64_t write_pointer = offset % zone_size; + uint64_t remaining_space = get_remaining_space(zone_num); - length -= zone_size - offset; + ceph_assert(get_write_pointer(zone_num) == write_pointer); + ceph_assert(remaining_space <= length); + advance_write_pointer(zone_num, remaining_space); + + ldout(cct, 10) << __func__ << " set zone 0x" << std::hex + << zone_num << " write pointer to 0x" << zone_size << dendl; + + length -= remaining_space; ceph_assert(length % zone_size == 0); for ( ; length; length -= zone_size) { - write_pointers[++zone] = zone_size; + advance_write_pointer(++zone_num, zone_size); ldout(cct, 10) << __func__ << " set zone 0x" << std::hex - << zone << " write pointer to 0x" << zone_size << dendl; + << zone_num << " write pointer to 0x" << zone_size << dendl; } } +void ZonedAllocator::set_zone_states(std::vector &&_zone_states) { + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << dendl; + zone_states = std::move(_zone_states); +} void ZonedAllocator::shutdown() { ldout(cct, 1) << __func__ << dendl; diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h index 2851c6e466a..22b40221f7b 100644 --- a/src/os/bluestore/ZonedAllocator.h +++ b/src/os/bluestore/ZonedAllocator.h @@ -18,7 +18,8 @@ #include "include/btree_map.h" #include "include/interval_set.h" #include "include/mempool.h" -#include "os/bluestore/bluestore_types.h" +#include "bluestore_types.h" +#include "zoned_types.h" class ZonedAllocator : public Allocator { CephContext* cct; @@ -33,34 +34,28 @@ class ZonedAllocator : public Allocator { uint64_t size; uint64_t block_size; uint64_t zone_size; - uint64_t starting_zone; - uint64_t nr_zones; - std::vector write_pointers; + uint64_t starting_zone_num; + uint64_t num_zones; + std::vector zone_states; - inline uint64_t zone_offset(uint64_t zone) { - ceph_assert(zone < nr_zones); - return zone * zone_size + zone_wp(zone); + inline uint64_t get_offset(uint64_t zone_num) const { + return zone_num * zone_size + get_write_pointer(zone_num); } - inline uint64_t zone_wp(uint64_t zone) { - ceph_assert(zone < nr_zones); - return write_pointers[zone]; + inline uint64_t get_write_pointer(uint64_t zone_num) const { + return zone_states[zone_num].get_write_pointer(); } - inline uint64_t zone_free_space(uint64_t zone) { - ceph_assert(zone < nr_zones); - return zone_size - zone_wp(zone); + inline uint64_t get_remaining_space(uint64_t zone_num) const { + return zone_size - get_write_pointer(zone_num); } - inline void advance_wp(uint64_t zone, uint64_t size) { - ceph_assert(zone < nr_zones); - write_pointers[zone] += size; - ceph_assert(write_pointers[zone] <= zone_size); + inline void advance_write_pointer(uint64_t zone_num, uint64_t want_size) { + zone_states[zone_num].increment_write_pointer(want_size); } - inline bool fits(uint64_t want_size, uint64_t zone) { - ceph_assert(zone < nr_zones); - return want_size <= zone_free_space(zone); + inline bool fits(uint64_t want_size, uint64_t zone_num) const { + return want_size <= get_remaining_space(zone_num); } public: @@ -80,6 +75,7 @@ public: void dump(std::function notify) override; + void set_zone_states(std::vector &&_zone_states) override; void init_add_free(uint64_t offset, uint64_t length) override; void init_rm_free(uint64_t offset, uint64_t length) override; diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc new file mode 100644 index 00000000000..b135ee5242a --- 
/dev/null +++ b/src/os/bluestore/ZonedFreelistManager.cc @@ -0,0 +1,315 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +// +// A freelist manager for zoned devices. This iteration just keeps the write +// pointer per zone. Following iterations will add enough information to enable +// cleaning of zones. +// +// Copyright (C) 2020 Abutalib Aghayev +// + +#include "ZonedFreelistManager.h" +#include "bluestore_common.h" +#include "include/stringify.h" +#include "kv/KeyValueDB.h" +#include "os/kv.h" +#include "zoned_types.h" + +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "zoned freelist " + +using std::string; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::decode; +using ceph::encode; + +void ZonedFreelistManager::write_zone_state_to_db( + uint64_t zone_num, + const zone_state_t &zone_state, + KeyValueDB::Transaction txn) { + string key; + _key_encode_u64(zone_num, &key); + bufferlist bl; + zone_state.encode(bl); + txn->merge(info_prefix, key, bl); +} + +void ZonedFreelistManager::load_zone_state_from_db( + uint64_t zone_num, + zone_state_t &zone_state, + KeyValueDB::Iterator& it) const { + string k = it->key(); + uint64_t zone_num_from_db; + _key_decode_u64(k.c_str(), &zone_num_from_db); + ceph_assert(zone_num_from_db == zone_num); + + bufferlist bl = it->value(); + auto p = bl.cbegin(); + zone_state.decode(p); +} + +void ZonedFreelistManager::init_zone_states(KeyValueDB::Transaction txn) { + dout(10) << __func__ << dendl; + for (uint64_t zone_num = 0; zone_num < num_zones; ++zone_num) { + zone_state_t zone_state; + write_zone_state_to_db(zone_num, zone_state, txn); + } +} + +void ZonedFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix) { + std::shared_ptr<Int64ArrayMergeOperator> merge_op( + new Int64ArrayMergeOperator); + db->set_merge_operator(prefix, merge_op); +} + +ZonedFreelistManager::ZonedFreelistManager( + CephContext* cct, + string meta_prefix, + string info_prefix) + : FreelistManager(cct), + meta_prefix(meta_prefix), + info_prefix(info_prefix), + enumerate_zone_num(~0UL) {} + +int ZonedFreelistManager::create( + uint64_t new_size, + uint64_t granularity, + KeyValueDB::Transaction txn) { + // To avoid interface changes, we piggyback zone size and the first sequential + // zone number onto the upper 32 bits of the 64-bit |granularity|. The lower 32 + // bits of |granularity| hold the actual allocation granularity, which + // is bytes_per_block. 
+ size = new_size; + bytes_per_block = granularity & 0x00000000ffffffff; + zone_size = ((granularity & 0x0000ffff00000000) >> 32) * 1024 * 1024; + num_zones = size / zone_size; + starting_zone_num = (granularity & 0xffff000000000000) >> 48; + enumerate_zone_num = ~0UL; + + ceph_assert(size % zone_size == 0); + + dout(1) << __func__ << std::hex + << " size 0x" << size + << " bytes_per_block 0x" << bytes_per_block + << " zone size 0x" << zone_size + << " num_zones 0x" << num_zones + << " starting_zone 0x" << starting_zone_num << dendl; + { + bufferlist bl; + encode(size, bl); + txn->set(meta_prefix, "size", bl); + } + { + bufferlist bl; + encode(bytes_per_block, bl); + txn->set(meta_prefix, "bytes_per_block", bl); + } + { + bufferlist bl; + encode(zone_size, bl); + txn->set(meta_prefix, "zone_size", bl); + } + { + bufferlist bl; + encode(num_zones, bl); + txn->set(meta_prefix, "num_zones", bl); + } + { + bufferlist bl; + encode(starting_zone_num, bl); + txn->set(meta_prefix, "starting_zone_num", bl); + } + + init_zone_states(txn); + + return 0; +} + +int ZonedFreelistManager::init( + KeyValueDB *kvdb, + bool db_in_read_only, + cfg_reader_t cfg_reader) { + dout(1) << __func__ << dendl; + int r = _read_cfg(cfg_reader); + if (r != 0) { + return r; + } + + ceph_assert(num_zones == size / zone_size); + + dout(10) << __func__ << std::hex + << " size 0x" << size + << " bytes_per_block 0x" << bytes_per_block + << " zone size 0x" << zone_size + << " num_zones 0x" << num_zones + << " starting_zone 0x" << starting_zone_num + << std::dec << dendl; + return 0; +} + +void ZonedFreelistManager::sync(KeyValueDB* kvdb) {} + +void ZonedFreelistManager::shutdown() { + dout(1) << __func__ << dendl; +} + +void ZonedFreelistManager::enumerate_reset() { + std::lock_guard l(lock); + + dout(1) << __func__ << dendl; + + enumerate_p.reset(); + enumerate_zone_num = ~0UL; +} + +// Currently, this just iterates over the list of zones and sets |offset| and +// |length| to the write pointer and the number of remaining free bytes in a +// given zone. Hence, it can set |length| to 0 if a zone is full, and it can +// also return two contiguous empty zones in two calls. This does not violate +// the current semantics of the call and appears to work fine with the clients of +// this call. +bool ZonedFreelistManager::enumerate_next( + KeyValueDB *kvdb, + uint64_t *offset, + uint64_t *length) { + std::lock_guard l(lock); + + // starting case + if (enumerate_zone_num == ~0UL) { + dout(30) << __func__ << " start" << dendl; + enumerate_p = kvdb->get_iterator(info_prefix); + enumerate_p->lower_bound(string()); + ceph_assert(enumerate_p->valid()); + enumerate_zone_num = 0; + } else { + enumerate_p->next(); + if (!enumerate_p->valid()) { + dout(30) << __func__ << " end" << dendl; + return false; + } + ++enumerate_zone_num; + } + + zone_state_t zone_state; + load_zone_state_from_db(enumerate_zone_num, zone_state, enumerate_p); + + *offset = enumerate_zone_num * zone_size + zone_state.get_write_pointer(); + *length = zone_size - zone_state.get_write_pointer(); + + dout(30) << __func__ << std::hex << " 0x" << *offset << "~" << *length + << std::dec << dendl; + + return true; +} + +void ZonedFreelistManager::dump(KeyValueDB *kvdb) { + enumerate_reset(); + uint64_t offset, length; + while (enumerate_next(kvdb, &offset, &length)) { + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + } } + +// Advances the write pointer and writes the updated write pointer to the database. 
+void ZonedFreelistManager::allocate( + uint64_t offset, + uint64_t length, + KeyValueDB::Transaction txn) { + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << dendl; + uint64_t zone_num = offset / zone_size; + zone_state_t zone_state; + zone_state.increment_write_pointer(length); + write_zone_state_to_db(zone_num, zone_state, txn); +} + +// Increments the number of dead bytes in a zone and writes the updated value to +// the database. The dead bytes in the zone are not usable. The cleaner will later +// copy live objects from the zone to another zone and make the zone writable +// again. The number of dead bytes in a zone is used by the cleaner to select +// which zones to clean -- the ones with the most dead bytes are good candidates +// since they require less I/O. +void ZonedFreelistManager::release( + uint64_t offset, + uint64_t length, + KeyValueDB::Transaction txn) { + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << dendl; + uint64_t zone_num = offset / zone_size; + zone_state_t zone_state; + zone_state.increment_num_dead_bytes(length); + write_zone_state_to_db(zone_num, zone_state, txn); +} + +void ZonedFreelistManager::get_meta( + uint64_t target_size, + std::vector<std::pair<std::string, std::string>>* res) const { + // We do not support expanding devices for now. + ceph_assert(target_size == 0); + res->emplace_back("zfm_size", stringify(size)); + res->emplace_back("zfm_bytes_per_block", stringify(bytes_per_block)); + res->emplace_back("zfm_zone_size", stringify(zone_size)); + res->emplace_back("zfm_num_zones", stringify(num_zones)); + res->emplace_back("zfm_starting_zone_num", stringify(starting_zone_num)); +} + +std::vector<zone_state_t> ZonedFreelistManager::get_zone_states( + KeyValueDB *kvdb) const { + std::vector<zone_state_t> zone_states; + auto p = kvdb->get_iterator(info_prefix); + uint64_t zone_num = 0; + for (p->lower_bound(string()); p->valid(); p->next(), ++zone_num) { + zone_state_t zone_state; + load_zone_state_from_db(zone_num, zone_state, p); + zone_states.emplace_back(zone_state); + } + return zone_states; +} + +// TODO: The following function is copied almost verbatim from +// BitmapFreelistManager. Eliminate duplication. +int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader) { + dout(1) << __func__ << dendl; + + string err; + + const size_t key_count = 5; + string keys[key_count] = { + "zfm_size", + "zfm_bytes_per_block", + "zfm_zone_size", + "zfm_num_zones", + "zfm_starting_zone_num" + }; + uint64_t* vals[key_count] = { + &size, + &bytes_per_block, + &zone_size, + &num_zones, + &starting_zone_num}; + + for (size_t i = 0; i < key_count; i++) { + string val; + int r = cfg_reader(keys[i], &val); + if (r == 0) { + *(vals[i]) = strict_iecstrtoll(val.c_str(), &err); + if (!err.empty()) { + derr << __func__ << " Failed to parse - " + << keys[i] << ":" << val + << ", error: " << err << dendl; + return -EINVAL; + } + } else { + // this is expected for legacy deployed OSDs + dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl; + return r; + } + } + return 0; +} diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h new file mode 100644 index 00000000000..ec08f311332 --- /dev/null +++ b/src/os/bluestore/ZonedFreelistManager.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +// +// A freelist manager for zoned devices. This iteration just keeps the write +// pointer per zone. Following iterations will add enough information to enable +// cleaning of zones. 
+// +// Copyright (C) 2020 Abutalib Aghayev +// + +#ifndef CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H +#define CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H + +#include "FreelistManager.h" + +#include +#include + +#include "common/ceph_mutex.h" +#include "include/buffer.h" +#include "kv/KeyValueDB.h" + +using cfg_reader_t = std::function<int(const std::string&, std::string*)>; + +class ZonedFreelistManager : public FreelistManager { + std::string meta_prefix; ///< device size, zone size, etc. + std::string info_prefix; ///< per zone write pointer, dead bytes + mutable ceph::mutex lock = ceph::make_mutex("ZonedFreelistManager::lock"); + + uint64_t size; ///< size of sequential region (bytes) + uint64_t bytes_per_block; ///< bytes per allocation unit (bytes) + uint64_t zone_size; ///< size of a single zone (bytes) + uint64_t num_zones; ///< number of sequential zones + uint64_t starting_zone_num; ///< the first sequential zone number + + KeyValueDB::Iterator enumerate_p; + uint64_t enumerate_zone_num; + + void write_zone_state_to_db(uint64_t zone_num, + const zone_state_t &zone_state, + KeyValueDB::Transaction txn); + void load_zone_state_from_db(uint64_t zone_num, + zone_state_t &zone_state, + KeyValueDB::Iterator &it) const; + + void init_zone_states(KeyValueDB::Transaction txn); + + void increment_write_pointer( + uint64_t zone, uint64_t length, KeyValueDB::Transaction txn); + void increment_num_dead_bytes( + uint64_t zone, uint64_t num_bytes, KeyValueDB::Transaction txn); + + int _read_cfg(cfg_reader_t cfg_reader); + +public: + ZonedFreelistManager(CephContext* cct, + std::string meta_prefix, + std::string info_prefix); + + static void setup_merge_operator(KeyValueDB *db, std::string prefix); + + int create(uint64_t size, + uint64_t granularity, + KeyValueDB::Transaction txn) override; + + int init(KeyValueDB *kvdb, + bool db_in_read_only, + cfg_reader_t cfg_reader) override; + + void shutdown() override; + void sync(KeyValueDB* kvdb) override; + void dump(KeyValueDB *kvdb) override; + + void enumerate_reset() override; + bool enumerate_next(KeyValueDB *kvdb, + uint64_t *offset, + uint64_t *length) override; + + void allocate(uint64_t offset, + uint64_t length, + KeyValueDB::Transaction txn) override; + + void release(uint64_t offset, + uint64_t length, + KeyValueDB::Transaction txn) override; + + inline uint64_t get_size() const override { + return size; + } + + inline uint64_t get_alloc_units() const override { + return size / bytes_per_block; + } + + inline uint64_t get_alloc_size() const override { + return bytes_per_block; + } + + void get_meta(uint64_t target_size, + std::vector<std::pair<std::string, std::string>>*) const override; + + std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const override; +}; + +#endif diff --git a/src/os/bluestore/zoned_types.cc b/src/os/bluestore/zoned_types.cc new file mode 100644 index 00000000000..f33bd89e3e4 --- /dev/null +++ b/src/os/bluestore/zoned_types.cc @@ -0,0 +1,24 @@ +#include "zoned_types.h" + +using ceph::decode; +using ceph::encode; + +std::ostream& operator<<(std::ostream& out, + const zone_state_t& zone_state) { + return out << std::hex + << " dead bytes: 0x" << zone_state.get_num_dead_bytes() + << " write pointer: 0x" << zone_state.get_write_pointer() + << " " << std::dec; +} + +void zone_state_t::encode(ceph::buffer::list &bl) const { + uint64_t v = static_cast<uint64_t>(num_dead_bytes) << 32 | write_pointer; + ::encode(v, bl); +} + +void zone_state_t::decode(ceph::buffer::list::const_iterator &p) { + uint64_t v; + ::decode(v, p); + num_dead_bytes = v >> 32; + write_pointer = v; // discard left-most 32 bits +} 
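The encode/decode pair above packs both counters into one 64-bit value so that zone-state updates can be applied through a RocksDB merge (the Int64ArrayMergeOperator mentioned in zoned_types.h) instead of read-modify-write point queries. The standalone sketch below is illustrative only and not part of the patch; the helper names pack/unpack are hypothetical, and a plain 64-bit addition stands in for what the merge operator does to an 8-byte value. It shows the round trip and why adding packed deltas is equivalent to adding the two 32-bit fields independently, as long as neither field overflows its 32-bit lane:

#include <cassert>
#include <cstdint>

// Hypothetical stand-ins for zone_state_t::encode()/decode(): the upper
// 32 bits hold the dead-byte count, the lower 32 bits the write pointer.
static uint64_t pack(uint32_t num_dead_bytes, uint32_t write_pointer) {
  return (static_cast<uint64_t>(num_dead_bytes) << 32) | write_pointer;
}

static void unpack(uint64_t v, uint32_t *num_dead_bytes, uint32_t *write_pointer) {
  *num_dead_bytes = v >> 32;
  *write_pointer = static_cast<uint32_t>(v);  // truncation drops the upper half
}

int main() {
  uint64_t base = pack(0, 0x1000);           // zone state already stored in the DB
  uint64_t alloc_delta = pack(0, 0x2000);    // allocate(): advance the write pointer
  uint64_t release_delta = pack(0x1800, 0);  // release(): 0x1800 bytes become dead

  // A single 64-bit add mirrors an element-wise int64-array merge of 8-byte values.
  uint64_t merged = base + alloc_delta + release_delta;

  uint32_t dead, wp;
  unpack(merged, &dead, &wp);
  assert(dead == 0x1800 && wp == 0x3000);    // both lanes updated independently
  return 0;
}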
diff --git a/src/os/bluestore/zoned_types.h b/src/os/bluestore/zoned_types.h new file mode 100644 index 00000000000..6ff5d5f313e --- /dev/null +++ b/src/os/bluestore/zoned_types.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OS_BLUESTORE_ZONED_TYPES_H +#define CEPH_OS_BLUESTORE_ZONED_TYPES_H + +#include "include/types.h" +#include "kv/KeyValueDB.h" +#include "os/kv.h" + +// Tracks two pieces of information about the state of a zone: (1) the number of +// dead bytes in the zone and (2) the write pointer. We assume that for now 32 bits +// are enough for the zone capacity and represent both as uint32_t, and we store +// them as a single 64-bit value in RocksDB so that we can use the existing +// Int64ArrayMergeOperator for merge and avoid the cost of point queries. +// +// We use the same struct for both the on-disk and the in-memory representation of +// the state. +struct zone_state_t { + uint32_t num_dead_bytes = 0; + uint32_t write_pointer = 0; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &p); + + uint64_t get_num_dead_bytes() const { + return num_dead_bytes; + } + + uint64_t get_write_pointer() const { + return write_pointer; + } + + void increment_num_dead_bytes(uint64_t num_bytes) { + num_dead_bytes += num_bytes; + } + + void increment_write_pointer(uint64_t num_bytes) { + write_pointer += num_bytes; + } +}; + +std::ostream& operator<<(std::ostream& out, const zone_state_t& zone_state); + +#endif
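For reference, the parameter piggybacking introduced in BlueStore::_piggyback_zoned_device_parameters_onto() and decoded in the ZonedAllocator constructor and in ZonedFreelistManager::create() can be exercised in isolation. The sketch below is not part of the patch; the helper name and the example geometry (4 KiB min_alloc_size, 256 MiB zones, a 52-zone conventional region) are assumptions chosen only to show the bit layout: bits 0-31 carry the allocation granularity, bits 32-47 the zone size in MiB, and bits 48-63 the first sequential zone number.

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the bit layout used by the patch.
static uint64_t piggyback(uint64_t min_alloc_size, uint64_t zone_size,
                          uint64_t conventional_region_size) {
  uint64_t zone_size_mb = zone_size / (1024 * 1024);
  uint64_t first_seq_zone = conventional_region_size / zone_size;
  min_alloc_size |= (zone_size_mb << 32);
  min_alloc_size |= (first_seq_zone << 48);
  return min_alloc_size;
}

int main() {
  const uint64_t min_alloc_size = 0x1000;               // 4 KiB (assumed)
  const uint64_t zone_size = 256ULL * 1024 * 1024;      // 256 MiB (assumed)
  const uint64_t conventional_region = 52 * zone_size;  // 52 conventional zones (assumed)

  uint64_t packed = piggyback(min_alloc_size, zone_size, conventional_region);

  // Decode exactly as ZonedAllocator's constructor and
  // ZonedFreelistManager::create() do.
  uint64_t block_size = packed & 0x00000000ffffffff;
  uint64_t decoded_zone_size = ((packed & 0x0000ffff00000000) >> 32) * 1024 * 1024;
  uint64_t first_seq_zone = (packed & 0xffff000000000000) >> 48;

  assert(block_size == min_alloc_size);
  assert(decoded_zone_size == zone_size);
  assert(first_seq_zone == 52);
  return 0;
}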