diff --git a/src/blk/BlockDevice.h b/src/blk/BlockDevice.h index 44ed3de4d8c..da80142ffe3 100644 --- a/src/blk/BlockDevice.h +++ b/src/blk/BlockDevice.h @@ -199,6 +199,9 @@ public: ceph_assert(is_smr()); return conventional_region_size; } + virtual void reset_zones(const std::set& zones) { + ceph_assert(is_smr()); + } virtual void aio_submit(IOContext *ioc) = 0; diff --git a/src/blk/zoned/HMSMRDevice.cc b/src/blk/zoned/HMSMRDevice.cc index dac61c879e7..045d690eaf6 100644 --- a/src/blk/zoned/HMSMRDevice.cc +++ b/src/blk/zoned/HMSMRDevice.cc @@ -412,6 +412,14 @@ void HMSMRDevice::_detect_vdo() return; } +void HMSMRDevice::reset_zones(const std::set& zones) { + for (auto zone_num : zones) { + if (zbd_reset_zones(zbd_fd, zone_num * zone_size, zone_size) != 0) { + derr << __func__ << " resetting zone failed for zone " << zone_num << dendl; + } + } +} + bool HMSMRDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const { if (vdo_fd < 0) { diff --git a/src/blk/zoned/HMSMRDevice.h b/src/blk/zoned/HMSMRDevice.h index 30941f2f9c6..fd3ebf78710 100644 --- a/src/blk/zoned/HMSMRDevice.h +++ b/src/blk/zoned/HMSMRDevice.h @@ -41,6 +41,7 @@ class HMSMRDevice final : public BlockDevice { string vdo_name; std::string devname; ///< kernel dev name (/sys/block/$devname), if any + int zbd_fd = -1; ///< fd for the zoned block device ceph::mutex debug_lock = ceph::make_mutex("HMSMRDevice::debug_lock"); interval_set debug_inflight; @@ -135,6 +136,8 @@ public: bool is_smr() const final { return true; } + void reset_zones(const std::set& zones) override; + bool get_thin_utilization(uint64_t *total, uint64_t *avail) const final; int read(uint64_t off, uint64_t len, bufferlist *pbl, diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index e93c87fc26e..7520b5de0ea 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -12376,6 +12376,19 @@ void BlueStore::_kv_finalize_thread() void BlueStore::_zoned_cleaner_start() { dout(10) << __func__ << dendl; + auto f = dynamic_cast(fm); + ceph_assert(f); + + auto zones_to_clean = f->get_cleaning_in_progress_zones(db); + if (!zones_to_clean.empty()) { + dout(10) << __func__ << " resuming cleaning after unclean shutdown." << dendl; + for (auto zone_num : zones_to_clean) { + _zoned_clean_zone(zone_num); + } + bdev->reset_zones(zones_to_clean); + f->mark_zones_to_clean_free(zones_to_clean, db); + } + zoned_cleaner_thread.create("bstore_zcleaner"); } @@ -12418,10 +12431,12 @@ void BlueStore::_zoned_cleaner_thread() { dout(20) << __func__ << " wake" << dendl; } else { l.unlock(); + f->mark_zones_to_clean_in_progress(*zones_to_clean, db); for (auto zone_num : *zones_to_clean) { _zoned_clean_zone(zone_num); } - f->mark_zones_to_clean_free(zones_to_clean, db); + bdev->reset_zones(*zones_to_clean); + f->mark_zones_to_clean_free(*zones_to_clean, db); a->mark_zones_to_clean_free(); l.lock(); } diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc index 951bd90877b..3b31e202fae 100644 --- a/src/os/bluestore/ZonedFreelistManager.cc +++ b/src/os/bluestore/ZonedFreelistManager.cc @@ -314,16 +314,43 @@ int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader) { return 0; } +std::set ZonedFreelistManager::get_cleaning_in_progress_zones( + KeyValueDB *kvdb) const { + bufferlist bl; + std::set zones_to_clean; + if (kvdb->get(meta_prefix, CLEANING_IN_PROGRESS_KEY, &bl) == 0) { + decode(zones_to_clean, bl); + } + return zones_to_clean; +} + void ZonedFreelistManager::mark_zones_to_clean_free( - const std::set *zones_to_clean, KeyValueDB *kvdb) { + const std::set& zones_to_clean, KeyValueDB *kvdb) { dout(10) << __func__ << dendl; KeyValueDB::Transaction txn = kvdb->get_transaction(); - for (auto zone_num : *zones_to_clean) { + for (auto zone_num : zones_to_clean) { ldout(cct, 10) << __func__ << " zone " << zone_num << " is now clean in DB" << dendl; zone_state_t zone_state; write_zone_state_to_db(zone_num, zone_state, txn); } + txn->rmkey(meta_prefix, CLEANING_IN_PROGRESS_KEY); + kvdb->submit_transaction_sync(txn); +} + +// Marks the zones currently being cleaned in the db. Should be called before +// starting the cleaning. If we crash mid-cleaning, the recovery code will check +// if there is a key CLEANING_IN_PROGRESS_KEY in the meta_prefix namespace, and +// if so, will read the zones and resume cleaning. +void ZonedFreelistManager::mark_zones_to_clean_in_progress( + const std::set& zones_to_clean, KeyValueDB *kvdb) { + dout(10) << __func__ << dendl; + + bufferlist bl; + encode(zones_to_clean, bl); + + KeyValueDB::Transaction txn = kvdb->get_transaction(); + txn->set(meta_prefix, CLEANING_IN_PROGRESS_KEY, bl); kvdb->submit_transaction_sync(txn); } diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h index 0a389a48470..14ea9e39e76 100644 --- a/src/os/bluestore/ZonedFreelistManager.h +++ b/src/os/bluestore/ZonedFreelistManager.h @@ -2,9 +2,7 @@ // vim: ts=8 sw=2 smarttab // -// A freelist manager for zoned devices. This iteration just keeps the write -// pointer per zone. Following iterations will add enough information to enable -// cleaning of zones. +// A freelist manager for zoned devices. // // Copyright (C) 2020 Abutalib Aghayev // @@ -24,6 +22,8 @@ using cfg_reader_t = std::function; +const string CLEANING_IN_PROGRESS_KEY = "cleaning_in_progress"; + class ZonedFreelistManager : public FreelistManager { std::string meta_prefix; ///< device size, zone size, etc. std::string info_prefix; ///< per zone write pointer, dead bytes @@ -102,8 +102,11 @@ public: std::vector>*) const override; std::vector get_zone_states(KeyValueDB *kvdb) const; - void mark_zones_to_clean_free(const std::set *zones_to_clean, + std::set get_cleaning_in_progress_zones(KeyValueDB *kvdb) const; + void mark_zones_to_clean_free(const std::set& zones_to_clean, KeyValueDB *kvdb); + void mark_zones_to_clean_in_progress(const std::set& zones_to_clean, + KeyValueDB *kvdb); }; #endif