Merge pull request #19843 from ifed01/wip-ifed-bluestore-repair

os/bluestore: implement BlueStore repair

Reviewed-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
This commit is contained in:
Kefu Chai 2018-02-24 12:15:40 +08:00 committed by GitHub
commit c8886cd77d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 1193 additions and 108 deletions

View File

@ -1078,6 +1078,7 @@ OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL)
OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
OPTION(bluestore_shard_finishers, OPT_BOOL)
OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
OPTION(bluestore_debug_inject_bug21040, OPT_BOOL)
OPTION(kstore_max_ops, OPT_U64)
OPTION(kstore_max_bytes, OPT_U64)

View File

@ -3943,6 +3943,9 @@ std::vector<Option> get_global_options() {
.set_default(0)
.set_description(""),
Option("bluestore_debug_inject_bug21040", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
// -----------------------------------------
// kstore

File diff suppressed because it is too large Load Diff

View File

@ -33,6 +33,7 @@
#include "include/unordered_map.h"
#include "include/memory.h"
#include "include/mempool.h"
#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/perf_counters.h"
#include "compressor/Compressor.h"
@ -45,6 +46,7 @@
class Allocator;
class FreelistManager;
class BlueFS;
class BlueStoreRepairer;
//#define DEBUG_CACHE
//#define DEBUG_DEFERRED
@ -405,7 +407,7 @@ public:
/// put logical references, and get back any released extents
void put_ref(uint64_t offset, uint32_t length,
PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs);
PExtentVector *r, bool *unshare);
friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
return l.get_sbid() == r.get_sbid();
@ -1460,6 +1462,14 @@ public:
int64_t& compressed_allocated() {
return values[STATFS_COMPRESSED_ALLOCATED];
}
volatile_statfs& operator=(const store_statfs_t& st) {
values[STATFS_ALLOCATED] = st.allocated;
values[STATFS_STORED] = st.stored;
values[STATFS_COMPRESSED_ORIGINAL] = st.compressed_original;
values[STATFS_COMPRESSED] = st.compressed;
values[STATFS_COMPRESSED_ALLOCATED] = st.compressed_allocated;
return *this;
}
bool is_empty() {
return values[STATFS_ALLOCATED] == 0 &&
values[STATFS_STORED] == 0 &&
@ -2049,11 +2059,13 @@ public:
private:
int _fsck_check_extents(
const coll_t& cid,
const ghobject_t& oid,
const PExtentVector& extents,
bool compressed,
mempool_dynamic_bitset &used_blocks,
uint64_t granularity,
BlueStoreRepairer* repairer,
store_statfs_t& expected_statfs);
void _buffer_cache_write(
@ -2085,6 +2097,8 @@ private:
uint64_t tail_pad,
bufferlist& padded);
void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
// -- ondisk version ---
public:
const int32_t latest_ondisk_format = 2; ///< our version
@ -2335,6 +2349,17 @@ public:
RWLock::WLocker l(debug_read_error_lock);
debug_mdata_error_objects.insert(o);
}
/// methods to inject various errors fsck can repair
void inject_broken_shared_blob_key(const string& key,
const bufferlist& bl);
void inject_leaked(uint64_t len);
void inject_false_free(coll_t cid, ghobject_t oid);
void inject_statfs(const store_statfs_t& new_statfs);
void inject_misreference(coll_t cid1, ghobject_t oid1,
coll_t cid2, ghobject_t oid2,
uint64_t offset);
void compact() override {
assert(db);
db->compact();
@ -2638,4 +2663,187 @@ static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
o->put();
}
/// Collects fixes for inconsistencies detected by BlueStore fsck and applies
/// them to the store. Each category of fix is staged into its own
/// KeyValueDB::Transaction and submitted by apply().
class BlueStoreRepairer
{
public:
  // to simplify future potential migration to mempools
  using fsck_interval = interval_set<uint64_t>;

  // Structure to track which pextents are used for a specific cid/oid.
  // Similar to a Bloom filter: only positive and false-positive matches
  // are possible.
  // Maintains two lists of bloom filters, one for cids and one for oids,
  // where each list entry is a BF covering a specific disk pextent.
  // The extent length covered by a single filter is computed in init().
  // Allows filtering out 'uninteresting' pextents to speed up subsequent
  // 'is_used' access.
  struct StoreSpaceTracker {
    const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
    const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
    static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024; // default memory budget

    typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
    bloom_vector collections_bfs; // one BF per granularity-sized extent (cids)
    bloom_vector objects_bfs;     // one BF per granularity-sized extent (oids)

    bool was_filtered_out = false;  // set once filter_out() has been applied
    uint64_t granularity = 0; // extent length for a single filter

    StoreSpaceTracker() {
    }
    // NOTE(review): 'was_filtered_out' is intentionally not copied, so a copy
    // always starts in the pre-filtering state -- confirm this is the intent.
    StoreSpaceTracker(const StoreSpaceTracker& from) :
      collections_bfs(from.collections_bfs),
      objects_bfs(from.objects_bfs),
      granularity(from.granularity) {
    }

    /// Size both filter lists for a device of 'total' bytes, choosing a
    /// granularity (multiple of min_alloc_size) that keeps the combined
    /// filter tables within 'mem_cap' bytes.
    void init(uint64_t total,
              uint64_t min_alloc_size,
              uint64_t mem_cap = DEF_MEM_CAP) {
      assert(!granularity); // not initialized yet
      assert(min_alloc_size && isp2(min_alloc_size));
      assert(mem_cap);

      total = ROUND_UP_TO(total, min_alloc_size);
      // factor 2: two filter tables (collections + objects) per extent
      granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;

      if (!granularity) {
        granularity = min_alloc_size;
      } else {
        granularity = ROUND_UP_TO(granularity, min_alloc_size);
      }

      uint64_t entries = P2ROUNDUP(total, granularity) / granularity;
      collections_bfs.resize(entries,
        bloom_filter(BLOOM_FILTER_SALT_COUNT,
                     BLOOM_FILTER_TABLE_SIZE,
                     0,
                     BLOOM_FILTER_EXPECTED_COUNT));
      objects_bfs.resize(entries,
        bloom_filter(BLOOM_FILTER_SALT_COUNT,
                     BLOOM_FILTER_TABLE_SIZE,
                     0,
                     BLOOM_FILTER_EXPECTED_COUNT));
    }
    /// Hash used to insert/look up a collection in the bloom filters.
    inline uint32_t get_hash(const coll_t& cid) const {
      return cid.hash_to_shard(1);
    }
    /// Mark every granularity-sized extent overlapping [offset, offset+len)
    /// as used by the given collection/object.
    inline void set_used(uint64_t offset, uint64_t len,
                         const coll_t& cid, const ghobject_t& oid) {
      assert(granularity); // initialized
      // can't call this func after filter_out has been applied
      assert(!was_filtered_out);

      if (!len) {
        return;
      }
      // determine the first and last filter slots the range touches
      auto pos = offset / granularity;
      auto end_pos = (offset + len - 1) / granularity;
      while (pos <= end_pos) {
        collections_bfs[pos].insert(get_hash(cid));
        objects_bfs[pos].insert(oid.hobj.get_hash());
        ++pos;
      }
    }
    // filter-out entries unrelated to the specified (broken) extents.
    // 'is_used' calls are permitted after that only
    size_t filter_out(const fsck_interval& extents);

    // determines if the collection is present after filtering-out
    inline bool is_used(const coll_t& cid) const {
      assert(was_filtered_out);
      for(auto& bf : collections_bfs) {
        if (bf.contains(get_hash(cid))) {
          return true;
        }
      }
      return false;
    }
    // determines if the object is present after filtering-out
    inline bool is_used(const ghobject_t& oid) const {
      assert(was_filtered_out);
      for(auto& bf : objects_bfs) {
        if (bf.contains(oid.hobj.get_hash())) {
          return true;
        }
      }
      return false;
    }
    // determines if the collection is present at 'offs' before filtering-out
    inline bool is_used(const coll_t& cid, uint64_t offs) const {
      assert(granularity); // initialized
      assert(!was_filtered_out);
      auto &bf = collections_bfs[offs / granularity];
      if (bf.contains(get_hash(cid))) {
        return true;
      }
      return false;
    }
    // determines if the object is present at 'offs' before filtering-out
    inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
      assert(granularity); // initialized
      assert(!was_filtered_out);
      auto &bf = objects_bfs[offs / granularity];
      if (bf.contains(oid.hobj.get_hash())) {
        return true;
      }
      return false;
    }
  };

public:
  // Each fix_*/remove_key call stages a change into the matching
  // transaction below and returns bool; changes are submitted by apply().
  bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
  bool fix_shared_blob(KeyValueDB *db,
                       uint64_t sbid,
                       const bufferlist* bl);
  bool fix_statfs(KeyValueDB *db, const store_statfs_t& new_statfs);

  bool fix_leaked(KeyValueDB *db,
                  FreelistManager* fm,
                  uint64_t offset, uint64_t len);
  bool fix_false_free(KeyValueDB *db,
                      FreelistManager* fm,
                      uint64_t offset, uint64_t len);

  /// Initialize the space-usage tracker for the given device size and
  /// tracking unit.
  void init(uint64_t total_space, uint64_t lres_tracking_unit_size);

  /// Prepare misreference fixes; must run before apply().
  bool preprocess_misreference(KeyValueDB *db);

  /// Submit all staged transactions; returns the number of repairs
  /// (presumably to_repair_cnt -- implementation not visible here).
  unsigned apply(KeyValueDB* db);

  /// Record a misreferenced extent; bumps the repair counter when
  /// inc_error is set.
  void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
    misreferenced_extents.insert(offs, len);
    if (inc_error) {
      ++to_repair_cnt;
    }
  }

  StoreSpaceTracker& get_space_usage_tracker() {
    return space_usage_tracker;
  }
  const fsck_interval& get_misreferences() const {
    return misreferenced_extents;
  }
  KeyValueDB::Transaction get_fix_misreferences_txn() {
    return fix_misreferences_txn;
  }

private:
  unsigned to_repair_cnt = 0; // number of repairs recorded so far
  // one staging transaction per repair category
  KeyValueDB::Transaction fix_fm_leaked_txn;
  KeyValueDB::Transaction fix_fm_false_free_txn;
  KeyValueDB::Transaction remove_key_txn;
  KeyValueDB::Transaction fix_statfs_txn;
  KeyValueDB::Transaction fix_shared_blob_txn;

  KeyValueDB::Transaction fix_misreferences_txn;

  StoreSpaceTracker space_usage_tracker;

  // non-shared extents with multiple references
  fsck_interval misreferenced_extents;
};
#endif

View File

@ -444,6 +444,9 @@ public:
const PExtentVector& get_extents() const {
return extents;
}
PExtentVector& dirty_extents() {
return extents;
}
DENC_HELPERS;
void bound_encode(size_t& p, uint64_t struct_v) const {
@ -852,6 +855,9 @@ struct bluestore_shared_blob_t {
bluestore_extent_ref_map_t ref_map; ///< shared blob extents
bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
bluestore_shared_blob_t(uint64_t _sbid,
bluestore_extent_ref_map_t&& _ref_map )
: sbid(_sbid), ref_map(std::move(_ref_map)) {}
DENC(bluestore_shared_blob_t, v, p) {
DENC_START(1, 1, p);

View File

@ -6907,6 +6907,167 @@ TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice2) {
g_conf->apply_changes(NULL);
}
TEST_P(StoreTest, BluestoreRepairTest) {
  // Exercises BlueStore fsck/repair: inject specific on-disk
  // inconsistencies, verify fsck() reports the expected error count, then
  // verify repair() succeeds and a subsequent fsck() is clean.
  if (string(GetParam()) != "bluestore")
    return;

  const size_t offs_base = 65536 / 2;

  // Drive fsck()/repair() explicitly: disable the automatic checks, and
  // shrink blob/shard limits so the small writes below produce multiple
  // blobs and extent-map shards.
  g_ceph_context->_conf->set_val("bluestore_fsck_on_mount", "false");
  g_ceph_context->_conf->set_val("bluestore_fsck_on_umount", "false");
  g_ceph_context->_conf->set_val("bluestore_max_blob_size", stringify(2 * offs_base));
  g_ceph_context->_conf->set_val("bluestore_extent_map_shard_max_size", "12000");
  g_ceph_context->_conf->apply_changes(NULL);

  BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());

  // fill the store with some data
  coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
  auto ch = store->create_new_collection(cid);

  ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
  ghobject_t hoid_dup(hobject_t(sobject_t("Object 1(dup)", CEPH_NOSNAP)));
  ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
  ghobject_t hoid_cloned = hoid2;
  hoid_cloned.hobj.snap = 1;
  ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP)));
  ghobject_t hoid3_cloned = hoid3;
  hoid3_cloned.hobj.snap = 1;
  bufferlist bl;
  bl.append("1234512345");
  int r;
  const size_t repeats = 16;
  {
    auto ch = store->create_new_collection(cid);
    cerr << "create collection + write" << std::endl;
    ObjectStore::Transaction t;
    t.create_collection(cid, 0);
    // spread small writes across many blobs of hoid/hoid_dup/hoid2
    for( auto i = 0ul; i < repeats; ++i ) {
      t.write(cid, hoid, i * offs_base, bl.length(), bl);
      t.write(cid, hoid_dup, i * offs_base, bl.length(), bl);
    }
    for( auto i = 0ul; i < repeats; ++i ) {
      t.write(cid, hoid2, i * offs_base, bl.length(), bl);
    }
    // clone to create shared blobs
    t.clone(cid, hoid2, hoid_cloned);

    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }

  bstore->umount();
  //////////// leaked pextent fix ////////////
  cerr << "fix leaked pextents" << std::endl;
  // baseline: a clean store has no errors and repair is a no-op
  ASSERT_EQ(bstore->fsck(false), 0);
  ASSERT_EQ(bstore->repair(false), 0);
  bstore->mount();
  bstore->inject_leaked(0x30000);
  bstore->umount();
  ASSERT_EQ(bstore->fsck(false), 1);
  ASSERT_EQ(bstore->repair(false), 0);
  ASSERT_EQ(bstore->fsck(false), 0);

  //////////// false free fix ////////////
  cerr << "fix false free pextents" << std::endl;
  bstore->mount();
  bstore->inject_false_free(cid, hoid);
  bstore->umount();
  ASSERT_EQ(bstore->fsck(false), 2);
  ASSERT_EQ(bstore->repair(false), 0);
  ASSERT_EQ(bstore->fsck(false), 0);

  //////////// verify invalid statfs ///////////
  cerr << "fix invalid statfs" << std::endl;
  store_statfs_t statfs0, statfs;
  bstore->mount();
  ASSERT_EQ(bstore->statfs(&statfs0), 0);
  statfs = statfs0;
  statfs.allocated += 0x10000;
  statfs.stored += 0x10000;
  ASSERT_FALSE(statfs0 == statfs);
  bstore->inject_statfs(statfs);
  bstore->umount();

  ASSERT_EQ(bstore->fsck(false), 1);
  ASSERT_EQ(bstore->repair(false), 0);
  ASSERT_EQ(bstore->fsck(false), 0);
  ASSERT_EQ(bstore->mount(), 0);
  ASSERT_EQ(bstore->statfs(&statfs), 0);
  // adjust free space to success in comparison
  statfs0.available = statfs.available;
  ASSERT_EQ(statfs0, statfs);

  ///////// undecodable shared blob key / stray shared blob records ///////
  cerr << "undecodable shared blob key" << std::endl;
  bstore->inject_broken_shared_blob_key("undec1",
                                        bufferlist());
  bstore->inject_broken_shared_blob_key("undecodable key 2",
                                        bufferlist());
  bstore->inject_broken_shared_blob_key("undecodable key 3",
                                        bufferlist());
  bstore->umount();
  ASSERT_EQ(bstore->fsck(false), 3);
  ASSERT_EQ(bstore->repair(false), 0);
  ASSERT_EQ(bstore->fsck(false), 0);

  cerr << "misreferencing" << std::endl;
  bstore->mount();
  bstore->inject_misreference(cid, hoid, cid, hoid_dup, 0);
  bstore->inject_misreference(cid, hoid, cid, hoid_dup, (offs_base * repeats) / 2);
  bstore->inject_misreference(cid, hoid, cid, hoid_dup, offs_base * (repeats -1) );

  bstore->umount();
  ASSERT_EQ(bstore->fsck(false), 6);
  ASSERT_EQ(bstore->repair(false), 0);
  ASSERT_EQ(bstore->fsck(true), 0);

  // reproducing issues #21040 & 20983
  g_ceph_context->_conf->set_val(
    "bluestore_debug_inject_bug21040", "true");
  g_ceph_context->_conf->apply_changes(NULL);
  bstore->mount();

  cerr << "repro bug #21040" << std::endl;
  {
    auto ch = store->open_collection(cid);
    {
      ObjectStore::Transaction t;
      bl.append("0123456789012345");
      t.write(cid, hoid3, offs_base, bl.length(), bl);
      bl.clear();
      bl.append('!');
      t.write(cid, hoid3, 0, bl.length(), bl);

      r = queue_transaction(store, ch, std::move(t));
      ASSERT_EQ(r, 0);
    }
    {
      ObjectStore::Transaction t;
      t.clone(cid, hoid3, hoid3_cloned);
      r = queue_transaction(store, ch, std::move(t));
      ASSERT_EQ(r, 0);
    }

    bstore->umount();
    ASSERT_EQ(bstore->fsck(false), 3);
    // FIX: was ASSERT_LE(repair, 0), which would also pass if repair()
    // returned a negative error code; repair must succeed here, matching
    // every other repair check in this test.
    ASSERT_EQ(bstore->repair(false), 0);
    ASSERT_EQ(bstore->fsck(false), 0);

    // FIX: the injection was mistakenly re-enabled ("true") here; turn it
    // off so the remainder of the run is not affected.
    g_ceph_context->_conf->set_val(
      "bluestore_debug_inject_bug21040", "false");
    g_ceph_context->_conf->apply_changes(NULL);
  }

  cerr << "Completing" << std::endl;
  bstore->mount();

  // restore the settings changed at the top of the test
  g_ceph_context->_conf->set_val("bluestore_fsck_on_mount", "true");
  g_ceph_context->_conf->set_val("bluestore_fsck_on_umount", "true");
  g_ceph_context->_conf->set_val("bluestore_max_alloc_size", "0");
  // FIX: bluestore_max_blob_size was modified at the top of the test but
  // never restored; reset it so later tests see the default again.
  g_ceph_context->_conf->set_val("bluestore_max_blob_size", "0");
  g_ceph_context->_conf->set_val("bluestore_extent_map_shard_max_size", "1200");
  g_ceph_context->_conf->apply_changes(NULL);
}
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);

View File

@ -1441,7 +1441,95 @@ TEST(GarbageCollector, BasicTest)
em.clear();
old_extents.clear();
}
}
}
TEST(BlueStoreRepairer, StoreSpaceTracker)
{
  // Unit test for BlueStoreRepairer::StoreSpaceTracker: granularity
  // selection in init(), per-extent set_used()/is_used() tracking, and
  // filter_out() behavior.

  // 4 TiB device with the default 128 MiB memory cap: granularity is
  // forced above min_alloc_size (2 MiB -> 2M filter entries).
  BlueStoreRepairer::StoreSpaceTracker bmap0;
  bmap0.init((uint64_t)4096 * 1024 * 1024 * 1024, 0x1000);
  ASSERT_EQ(bmap0.granularity, 2 * 1024 * 1024);
  ASSERT_EQ(bmap0.collections_bfs.size(), 2048 * 1024);
  ASSERT_EQ(bmap0.objects_bfs.size(), 2048 * 1024);

  // Small device with a generous cap: granularity collapses to
  // min_alloc_size (0x1000), total rounded up to 0x2000 entries.
  BlueStoreRepairer::StoreSpaceTracker bmap;
  bmap.init(0x2000 * 0x1000 - 1, 0x1000, 512 * 1024);
  ASSERT_EQ(bmap.granularity, 0x1000);
  ASSERT_EQ(bmap.collections_bfs.size(), 0x2000);
  ASSERT_EQ(bmap.objects_bfs.size(), 0x2000);

  coll_t cid;
  ghobject_t hoid;

  // nothing marked yet
  ASSERT_FALSE(bmap.is_used(cid, 0));
  ASSERT_FALSE(bmap.is_used(hoid, 0));
  // a 1-byte mark touches only the first filter slot
  bmap.set_used(0, 1, cid, hoid);
  ASSERT_TRUE(bmap.is_used(cid, 0));
  ASSERT_TRUE(bmap.is_used(hoid, 0));

  ASSERT_FALSE(bmap.is_used(cid, 0x1023));
  ASSERT_FALSE(bmap.is_used(hoid, 0x1023));
  ASSERT_FALSE(bmap.is_used(cid, 0x2023));
  ASSERT_FALSE(bmap.is_used(hoid, 0x2023));
  ASSERT_FALSE(bmap.is_used(cid, 0x3023));
  ASSERT_FALSE(bmap.is_used(hoid, 0x3023));
  // an unaligned range [0x1023, 0x4023) covers slots 1..4
  bmap.set_used(0x1023, 0x3000, cid, hoid);
  ASSERT_TRUE(bmap.is_used(cid, 0x1023));
  ASSERT_TRUE(bmap.is_used(hoid, 0x1023));
  ASSERT_TRUE(bmap.is_used(cid, 0x2023));
  ASSERT_TRUE(bmap.is_used(hoid, 0x2023));
  ASSERT_TRUE(bmap.is_used(cid, 0x3023));
  ASSERT_TRUE(bmap.is_used(hoid, 0x3023));

  ASSERT_FALSE(bmap.is_used(cid, 0x9001));
  ASSERT_FALSE(bmap.is_used(hoid, 0x9001));
  ASSERT_FALSE(bmap.is_used(cid, 0xa001));
  ASSERT_FALSE(bmap.is_used(hoid, 0xa001));
  ASSERT_FALSE(bmap.is_used(cid, 0xb000));
  ASSERT_FALSE(bmap.is_used(hoid, 0xb000));
  ASSERT_FALSE(bmap.is_used(cid, 0xc000));
  ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
  // range ending exactly on a slot boundary (0x9001 + 0x2fff = 0xc000)
  // must NOT mark the slot that starts at 0xc000
  bmap.set_used(0x9001, 0x2fff, cid, hoid);
  ASSERT_TRUE(bmap.is_used(cid, 0x9001));
  ASSERT_TRUE(bmap.is_used(hoid, 0x9001));
  ASSERT_TRUE(bmap.is_used(cid, 0xa001));
  ASSERT_TRUE(bmap.is_used(hoid, 0xa001));
  ASSERT_TRUE(bmap.is_used(cid, 0xb001));
  ASSERT_TRUE(bmap.is_used(hoid, 0xb001));
  ASSERT_FALSE(bmap.is_used(cid, 0xc000));
  ASSERT_FALSE(bmap.is_used(hoid, 0xc000));

  // re-marking inside an already-marked slot is a no-op for neighbors
  bmap.set_used(0xa001, 0x2, cid, hoid);
  ASSERT_TRUE(bmap.is_used(cid, 0x9001));
  ASSERT_TRUE(bmap.is_used(hoid, 0x9001));
  ASSERT_TRUE(bmap.is_used(cid, 0xa001));
  ASSERT_TRUE(bmap.is_used(hoid, 0xa001));
  ASSERT_TRUE(bmap.is_used(cid, 0xb001));
  ASSERT_TRUE(bmap.is_used(hoid, 0xb001));
  ASSERT_FALSE(bmap.is_used(cid, 0xc000));
  ASSERT_FALSE(bmap.is_used(hoid, 0xc000));

  ASSERT_FALSE(bmap.is_used(cid, 0xc0000));
  ASSERT_FALSE(bmap.is_used(hoid, 0xc0000));
  ASSERT_FALSE(bmap.is_used(cid, 0xc1000));
  ASSERT_FALSE(bmap.is_used(hoid, 0xc1000));
  // aligned two-slot range
  bmap.set_used(0xc0000, 0x2000, cid, hoid);
  ASSERT_TRUE(bmap.is_used(cid, 0xc0000));
  ASSERT_TRUE(bmap.is_used(hoid, 0xc0000));
  ASSERT_TRUE(bmap.is_used(cid, 0xc1000));
  ASSERT_TRUE(bmap.is_used(hoid, 0xc1000));

  // keep only filters overlapping these extents; the used slots at
  // 0x0, 0x1000.. and 0xa000 intersect them -> 3 filters survive
  interval_set<uint64_t> extents;
  extents.insert(0,0x500);
  extents.insert(0x800,0x100);
  extents.insert(0x1000,0x1000);
  extents.insert(0xa001,1);
  extents.insert(0xa0000,0xff8);

  ASSERT_EQ(bmap.filter_out(extents), 3);
  // after filtering the coarse (whole-tracker) lookups still find cid/hoid
  ASSERT_TRUE(bmap.is_used(cid));
  ASSERT_TRUE(bmap.is_used(hoid));
}
int main(int argc, char **argv) {
vector<const char*> args;