Merge pull request #32758 from ifed01/wip-ifed-fix-legacy-omap

os/bluestore: upgrade legacy omap to per-pool format automatically.

Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Kefu Chai 2020-01-26 11:55:21 +08:00 committed by GitHub
commit 42cf0226bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 175 additions and 144 deletions

View File

@ -6101,6 +6101,20 @@ void BlueStore::_fsck_collections(int64_t* errors)
} }
} }
void BlueStore::_set_per_pool_omap()
{
per_pool_omap = false;
bufferlist bl;
db->get(PREFIX_SUPER, "per_pool_omap", &bl);
if (bl.length()) {
per_pool_omap = true;
dout(10) << __func__ << " per_pool_omap=1" << dendl;
} else {
dout(10) << __func__ << " per_pool_omap not present" << dendl;
}
_check_no_per_pool_omap_alert();
}
void BlueStore::_open_statfs() void BlueStore::_open_statfs()
{ {
osd_pools.clear(); osd_pools.clear();
@ -6964,16 +6978,23 @@ int BlueStore::_mount(bool kv_only, bool open_db)
mempool_thread.init(); mempool_thread.init();
if (!per_pool_stat_collection && if ((!per_pool_stat_collection || !per_pool_omap) &&
cct->_conf->bluestore_fsck_quick_fix_on_mount == true) { cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
bool was_per_pool_omap = per_pool_omap;
dout(1) << __func__ << " quick-fix on mount" << dendl; dout(1) << __func__ << " quick-fix on mount" << dendl;
_fsck_on_open(FSCK_SHALLOW, true); _fsck_on_open(FSCK_SHALLOW, true);
//reread statfs //reread statfs
//FIXME minor: replace with actual open/close? //FIXME minor: replace with actual open/close?
_open_statfs(); _open_statfs();
_check_legacy_statfs_alert(); _check_legacy_statfs_alert();
//set again as hopefully it has been fixed
if (!was_per_pool_omap) {
_set_per_pool_omap();
}
} }
mounted = true; mounted = true;
@ -7069,6 +7090,19 @@ int BlueStore::cold_close()
return 0; return 0;
} }
// derr wrapper to limit enormous output and avoid log flooding.
// Of limited use where such output is expected for now.
//
// Usage:
//   fsck_derr(err_cnt, threshold) << "message" << fsck_dendl;
//
// The message is emitted only while err_cnt <= threshold; when
// err_cnt == threshold an extra "more error lines skipped..." line follows.
// fsck_derr opens a block that fsck_dendl closes, so the two must always be
// used as a pair within a single statement.
// Both arguments are parenthesized so that arbitrary expressions (e.g. a
// ternary) compare as a whole; they are evaluated twice and therefore must
// be free of side effects.
#define fsck_derr(err_cnt, threshold)                     \
  if ((err_cnt) <= (threshold)) {                         \
    bool need_skip_print = (err_cnt) == (threshold);      \
    derr
#define fsck_dendl                                        \
    dendl;                                                \
    if (need_skip_print)                                  \
      derr << "more error lines skipped..." << dendl;     \
  }
int _fsck_sum_extents( int _fsck_sum_extents(
const PExtentVector& extents, const PExtentVector& extents,
bool compressed, bool compressed,
@ -7239,7 +7273,6 @@ void BlueStore::_fsck_check_pool_statfs(
++errors; ++errors;
} }
if (!per_pool_stat_collection && if (!per_pool_stat_collection &&
cct->_conf->bluestore_fsck_error_on_no_per_pool_stats &&
repairer) { repairer) {
// by virtue of running this method, we correct the top-level // by virtue of running this method, we correct the top-level
// error of having global stats // error of having global stats
@ -7428,6 +7461,11 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
*res_statfs); *res_statfs);
} }
} // for (auto& i : ref_map) } // for (auto& i : ref_map)
if (o->onode.has_omap()) {
_fsck_check_object_omap(depth, o, ctx);
}
return o; return o;
} }
@ -7551,9 +7589,7 @@ public:
batch->num_sharded_objects, batch->num_sharded_objects,
batch->num_spanning_blobs, batch->num_spanning_blobs,
nullptr, // used_blocks nullptr, // used_blocks
nullptr, // used_omap_head; nullptr, //used_omap_head
nullptr, // used_per_pool_omap_head;
nullptr, // used_pgmeta_omap_head;
sb_info_lock, sb_info_lock,
*sb_info, *sb_info,
batch->expected_store_statfs, batch->expected_store_statfs,
@ -7664,6 +7700,7 @@ public:
ctx.num_blobs += batch.num_blobs; ctx.num_blobs += batch.num_blobs;
ctx.num_sharded_objects += batch.num_sharded_objects; ctx.num_sharded_objects += batch.num_sharded_objects;
ctx.num_spanning_blobs += batch.num_spanning_blobs; ctx.num_spanning_blobs += batch.num_spanning_blobs;
ctx.expected_store_statfs.add(batch.expected_store_statfs); ctx.expected_store_statfs.add(batch.expected_store_statfs);
for (auto it = batch.expected_pool_statfs.begin(); for (auto it = batch.expected_pool_statfs.begin();
@ -7676,20 +7713,92 @@ public:
}; };
}; };
/// Check the omap format of a single onode and, when a repairer is supplied,
/// convert a legacy omap to the per-pool format.
///
/// An onode whose omap is neither per-pool nor pgmeta is reported as an error
/// when per_pool_omap is set; otherwise it is reported as an error or a
/// warning depending on bluestore_fsck_error_on_no_per_pool_omap.
/// With a repairer, the omap header and key/values are read through
/// _omap_get(), the old key range is removed, FLAG_PERPOOL_OMAP is set on the
/// onode and everything is rewritten under the new prefix in one transaction.
///
/// @param depth  fsck depth; not used by this method.
/// @param o      onode to examine; must have omap (asserted).
/// @param ctx    fsck context providing the error/warning counters and the
///               optional repairer.
void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
  OnodeRef& o,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& warnings = ctx.warnings;
  auto repairer = ctx.repairer;

  ceph_assert(o->onode.has_omap());
  if (o->onode.is_perpool_omap() || o->onode.is_pgmeta_omap()) {
    // not a legacy omap: nothing to report and nothing to convert
    return;
  }

  if (per_pool_omap) {
    // the threshold is tested against the counter before it is bumped
    fsck_derr(errors, MAX_FSCK_ERROR_LINES)
      << "fsck error: " << o->oid
      << " has omap that is not per-pool or pgmeta"
      << fsck_dendl;
    ++errors;
  } else {
    const char* w;
    int64_t num;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
      ++errors;
      num = errors;
      w = "error";
    } else {
      ++warnings;
      num = warnings;
      w = "warning";
    }
    // here the threshold is tested against the already bumped counter
    fsck_derr(num, MAX_FSCK_ERROR_LINES)
      << "fsck " << w << ": " << o->oid
      << " has omap that is not per-pool or pgmeta"
      << fsck_dendl;
  }

  if (!repairer) {
    return;
  }

  dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
  bufferlist h;
  map<string, bufferlist> kv;
  int r = _omap_get(o->c, o->oid, &h, &kv);
  if (r < 0) {
    derr << " got " << r << " " << cpp_strerror(r) << dendl;
    return;
  }

  KeyValueDB::Transaction txn = db->get_transaction();
  // remove old keys; prefix/head/tail must be captured before the flag
  // below is set, as they are derived from the onode's current state
  const string& old_omap_prefix = o->get_omap_prefix();
  string old_head, old_tail;
  o->get_omap_header(&old_head);
  o->get_omap_tail(&old_tail);
  txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
  txn->rmkey(old_omap_prefix, old_tail);
  // set flag
  o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
  _record_onode(o, txn);
  const string& new_omap_prefix = o->get_omap_prefix();
  // head
  if (h.length()) {
    string new_head;
    o->get_omap_header(&new_head);
    txn->set(new_omap_prefix, new_head, h);
  }
  // tail
  string new_tail;
  o->get_omap_tail(&new_tail);
  bufferlist empty;
  txn->set(new_omap_prefix, new_tail, empty);
  // values: reuse one key buffer, keeping the common per-object part
  string final_key;
  o->get_omap_key(string(), &final_key);
  const size_t base_key_len = final_key.size();
  for (auto& i : kv) {
    final_key.resize(base_key_len);
    final_key += i.first;
    txn->set(new_omap_prefix, final_key, i.second);
  }
  r = db->submit_transaction_sync(txn);
  if (r < 0) {
    // do not count a conversion that never reached the DB as repaired
    // NOTE(review): the in-memory onode keeps FLAG_PERPOOL_OMAP at this
    // point - confirm whether it has to be rolled back on failure.
    derr << "fsck failed to convert " << o->oid << " omap to per-pool: "
         << r << " " << cpp_strerror(r) << dendl;
    return;
  }
  repairer->inc_repaired();
}
void BlueStore::_fsck_check_objects(FSCKDepth depth, void BlueStore::_fsck_check_objects(FSCKDepth depth,
BlueStore::FSCK_ObjectCtx& ctx) BlueStore::FSCK_ObjectCtx& ctx)
{ {
//no need for the below lock when in non-shallow mode as
// there is no multithreading in this case
if (depth != FSCK_SHALLOW) {
ctx.sb_info_lock = nullptr;
}
auto& errors = ctx.errors; auto& errors = ctx.errors;
auto& warnings = ctx.warnings;
auto used_omap_head = ctx.used_omap_head;
auto used_per_pool_omap_head = ctx.used_per_pool_omap_head;
auto used_pgmeta_omap_head = ctx.used_pgmeta_omap_head;
auto sb_info_lock = ctx.sb_info_lock; auto sb_info_lock = ctx.sb_info_lock;
auto& sb_info = ctx.sb_info; auto& sb_info = ctx.sb_info;
auto repairer = ctx.repairer; auto repairer = ctx.repairer;
@ -7897,91 +8006,15 @@ void BlueStore::_fsck_check_objects(FSCKDepth depth,
} }
// omap // omap
if (o->onode.has_omap()) { if (o->onode.has_omap()) {
ceph_assert(used_omap_head); ceph_assert(ctx.used_omap_head);
ceph_assert(used_per_pool_omap_head); if (ctx.used_omap_head->count(o->onode.nid)) {
ceph_assert(used_pgmeta_omap_head); derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
auto m = << " already in use" << dendl;
o->onode.is_pgmeta_omap() ? used_pgmeta_omap_head :
(o->onode.is_perpool_omap() ? used_per_pool_omap_head : used_omap_head);
if (m->count(o->onode.nid)) {
derr << "fsck error: " << oid << " omap_head " << o->onode.nid
<< " already in use" << dendl;
++errors; ++errors;
} else {
ctx.used_omap_head->insert(o->onode.nid);
} }
else { } // if (o->onode.has_omap())
m->insert(o->onode.nid);
}
if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
if (per_pool_omap) {
derr << "fsck error: " << oid
<< " has omap that is not per-pool or pgmeta" << dendl;
++errors;
}
else {
const char* w;
if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
++errors;
w = "error";
}
else {
++warnings;
w = "warning";
}
derr << "fsck " << w << ": " << oid
<< " has omap that is not per-pool or pgmeta" << dendl;
}
}
if (repairer &&
o->onode.has_omap() &&
!o->onode.is_perpool_omap() &&
!o->oid.is_pgmeta()) {
derr << "fsck converting " << oid << " omap to per-pool" << dendl;
used_omap_head->erase(o->onode.nid);
used_per_pool_omap_head->insert(o->onode.nid);
bufferlist h;
map<string, bufferlist> kv;
int r = _omap_get(c.get(), oid, &h, &kv);
if (r < 0) {
derr << " got " << r << " " << cpp_strerror(r) << dendl;
}
else {
KeyValueDB::Transaction txn = db->get_transaction();
// remove old keys
const string& old_omap_prefix = o->get_omap_prefix();
string old_head, old_tail;
o->get_omap_header(&old_head);
o->get_omap_tail(&old_tail);
txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
txn->rmkey(old_omap_prefix, old_tail);
// set flag
o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
_record_onode(o, txn);
const string& new_omap_prefix = o->get_omap_prefix();
// head
if (h.length()) {
string new_head;
o->get_omap_header(&new_head);
txn->set(new_omap_prefix, new_head, h);
}
// tail
string new_tail;
o->get_omap_tail(&new_tail);
bufferlist empty;
txn->set(new_omap_prefix, new_tail, empty);
// values
string final_key;
o->get_omap_key(string(), &final_key);
size_t base_key_len = final_key.size();
for (auto& i : kv) {
final_key.resize(base_key_len);
final_key += i.first;
txn->set(new_omap_prefix, final_key, i.second);
}
db->submit_transaction_sync(txn);
repairer->inc_repaired();
}
}
} // if (depth != FSCK_SHALLOW && o->onode.has_omap())
if (depth == FSCK_DEEP) { if (depth == FSCK_DEEP) {
bufferlist bl; bufferlist bl;
uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap; uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
@ -8140,8 +8173,6 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
unsigned repaired = 0; unsigned repaired = 0;
uint64_t_btree_t used_omap_head; uint64_t_btree_t used_omap_head;
uint64_t_btree_t used_per_pool_omap_head;
uint64_t_btree_t used_pgmeta_omap_head;
uint64_t_btree_t used_sbids; uint64_t_btree_t used_sbids;
mempool_dynamic_bitset used_blocks; mempool_dynamic_bitset used_blocks;
@ -8270,15 +8301,15 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
num_spanning_blobs, num_spanning_blobs,
&used_blocks, &used_blocks,
&used_omap_head, &used_omap_head,
&used_per_pool_omap_head, //no need for the below lock when in non-shallow mode as
&used_pgmeta_omap_head, // there is no multithreading in this case
&sb_info_lock, depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
sb_info, sb_info,
expected_store_statfs, expected_store_statfs,
expected_pool_statfs, expected_pool_statfs,
repair ? &repairer : nullptr); repair ? &repairer : nullptr);
_fsck_check_objects(depth,
ctx); _fsck_check_objects(depth, ctx);
} }
dout(1) << __func__ << " checking shared_blobs" << dendl; dout(1) << __func__ << " checking shared_blobs" << dendl;
@ -8602,7 +8633,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
errors, warnings, repair ? &repairer : nullptr); errors, warnings, repair ? &repairer : nullptr);
if (depth != FSCK_SHALLOW) { if (depth != FSCK_SHALLOW) {
dout(1) << __func__ << " checking for stray omap data" << dendl; dout(1) << __func__ << " checking for stray omap data " << dendl;
it = db->get_iterator(PREFIX_OMAP); it = db->get_iterator(PREFIX_OMAP);
if (it) { if (it) {
uint64_t last_omap_head = 0; uint64_t last_omap_head = 0;
@ -8611,8 +8642,9 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
_key_decode_u64(it->key().c_str(), &omap_head); _key_decode_u64(it->key().c_str(), &omap_head);
if (used_omap_head.count(omap_head) == 0 && if (used_omap_head.count(omap_head) == 0 &&
omap_head != last_omap_head) { omap_head != last_omap_head) {
derr << "fsck error: found stray omap data on omap_head " fsck_derr(errors, MAX_FSCK_ERROR_LINES)
<< omap_head << dendl; << "fsck error: found stray omap data on omap_head "
<< omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head)<< fsck_dendl;
++errors; ++errors;
last_omap_head = omap_head; last_omap_head = omap_head;
} }
@ -8624,11 +8656,12 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
for (it->lower_bound(string()); it->valid(); it->next()) { for (it->lower_bound(string()); it->valid(); it->next()) {
uint64_t omap_head; uint64_t omap_head;
_key_decode_u64(it->key().c_str(), &omap_head); _key_decode_u64(it->key().c_str(), &omap_head);
if (used_pgmeta_omap_head.count(omap_head) == 0 && if (used_omap_head.count(omap_head) == 0 &&
omap_head != last_omap_head) { omap_head != last_omap_head) {
derr << "fsck error: found stray (pgmeta) omap data on omap_head " fsck_derr(errors, MAX_FSCK_ERROR_LINES)
<< omap_head << dendl; << "fsck error: found stray (pgmeta) omap data on omap_head "
last_omap_head = omap_head; << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
last_omap_head = omap_head;
++errors; ++errors;
} }
} }
@ -8643,11 +8676,12 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
const char *c = k.c_str(); const char *c = k.c_str();
c = _key_decode_u64(c, &pool); c = _key_decode_u64(c, &pool);
c = _key_decode_u64(c, &omap_head); c = _key_decode_u64(c, &omap_head);
if (used_per_pool_omap_head.count(omap_head) == 0 && if (used_omap_head.count(omap_head) == 0 &&
omap_head != last_omap_head) { omap_head != last_omap_head) {
derr << "fsck error: found stray (per-pool) omap data on omap_head " fsck_derr(errors, MAX_FSCK_ERROR_LINES)
<< omap_head << dendl; << "fsck error: found stray (per-pool) omap data on omap_head "
++errors; << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
++errors;
last_omap_head = omap_head; last_omap_head = omap_head;
} }
} }
@ -8769,8 +8803,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
} }
} }
if (repair) { if (repair) {
if (!per_pool_omap && if (!per_pool_omap) {
depth != FSCK_SHALLOW) {
dout(5) << __func__ << " marking per_pool_omap=1" << dendl; dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
repairer.fix_per_pool_omap(db); repairer.fix_per_pool_omap(db);
} }
@ -8796,7 +8829,12 @@ out_scan:
<< repaired << " repaired, " << repaired << " repaired, "
<< (errors + warnings - (int)repaired) << " remaining in " << (errors + warnings - (int)repaired) << " remaining in "
<< duration << " seconds" << dendl; << duration << " seconds" << dendl;
return errors - (int)repaired;
// In non-repair mode we should return error count only as
// it indicates if store status is OK.
// In repair mode both errors and warnings are taken into account
// since repaired counter relates to them both.
return repair ? errors + warnings - (int)repaired : errors;
} }
/// methods to inject various errors fsck can repair /// methods to inject various errors fsck can repair
@ -10840,17 +10878,7 @@ int BlueStore::_open_super_meta()
<< std::dec << dendl; << std::dec << dendl;
} }
{ _set_per_pool_omap();
bufferlist bl;
db->get(PREFIX_SUPER, "per_pool_omap", &bl);
if (bl.length()) {
per_pool_omap = true;
dout(10) << __func__ << " per_pool_omap=1" << dendl;
} else {
dout(10) << __func__ << " per_pool_omap not present" << dendl;
}
_check_no_per_pool_omap_alert();
}
_open_statfs(); _open_statfs();
_set_alloc_sizes(); _set_alloc_sizes();

View File

@ -2229,6 +2229,7 @@ private:
void _set_alloc_sizes(); void _set_alloc_sizes();
void _set_blob_size(); void _set_blob_size();
void _set_finisher_num(); void _set_finisher_num();
void _set_per_pool_omap();
void _update_osd_memory_options(); void _update_osd_memory_options();
int _open_bdev(bool create); int _open_bdev(bool create);
@ -2371,6 +2372,9 @@ public:
FSCK_DEEP, FSCK_DEEP,
FSCK_SHALLOW FSCK_SHALLOW
}; };
enum {
MAX_FSCK_ERROR_LINES = 100,
};
private: private:
int _fsck_check_extents( int _fsck_check_extents(
@ -3237,8 +3241,6 @@ public:
mempool_dynamic_bitset* used_blocks; mempool_dynamic_bitset* used_blocks;
uint64_t_btree_t* used_omap_head; uint64_t_btree_t* used_omap_head;
uint64_t_btree_t* used_per_pool_omap_head;
uint64_t_btree_t* used_pgmeta_omap_head;
ceph::mutex* sb_info_lock; ceph::mutex* sb_info_lock;
sb_info_map_t& sb_info; sb_info_map_t& sb_info;
@ -3256,8 +3258,6 @@ public:
uint64_t& _num_spanning_blobs, uint64_t& _num_spanning_blobs,
mempool_dynamic_bitset* _ub, mempool_dynamic_bitset* _ub,
uint64_t_btree_t* _used_omap_head, uint64_t_btree_t* _used_omap_head,
uint64_t_btree_t* _used_per_pool_omap_head,
uint64_t_btree_t* _used_pgmeta_omap_head,
ceph::mutex* _sb_info_lock, ceph::mutex* _sb_info_lock,
sb_info_map_t& _sb_info, sb_info_map_t& _sb_info,
store_statfs_t& _store_statfs, store_statfs_t& _store_statfs,
@ -3272,8 +3272,6 @@ public:
num_spanning_blobs(_num_spanning_blobs), num_spanning_blobs(_num_spanning_blobs),
used_blocks(_ub), used_blocks(_ub),
used_omap_head(_used_omap_head), used_omap_head(_used_omap_head),
used_per_pool_omap_head(_used_per_pool_omap_head),
used_pgmeta_omap_head(_used_pgmeta_omap_head),
sb_info_lock(_sb_info_lock), sb_info_lock(_sb_info_lock),
sb_info(_sb_info), sb_info(_sb_info),
expected_store_statfs(_store_statfs), expected_store_statfs(_store_statfs),
@ -3294,6 +3292,10 @@ public:
const BlueStore::FSCK_ObjectCtx& ctx); const BlueStore::FSCK_ObjectCtx& ctx);
private: private:
void _fsck_check_object_omap(FSCKDepth depth,
OnodeRef& o,
const BlueStore::FSCK_ObjectCtx& ctx);
void _fsck_check_objects(FSCKDepth depth, void _fsck_check_objects(FSCKDepth depth,
FSCK_ObjectCtx& ctx); FSCK_ObjectCtx& ctx);
}; };
@ -3483,6 +3485,7 @@ public:
++to_repair_cnt; ++to_repair_cnt;
} }
} }
// In fact, this is the only method of the repairer that is thread-safe!
void inc_repaired() { void inc_repaired() {
++to_repair_cnt; ++to_repair_cnt;
} }
@ -3498,7 +3501,7 @@ public:
} }
private: private:
unsigned to_repair_cnt = 0; std::atomic<unsigned> to_repair_cnt = { 0 };
KeyValueDB::Transaction fix_per_pool_omap_txn; KeyValueDB::Transaction fix_per_pool_omap_txn;
KeyValueDB::Transaction fix_fm_leaked_txn; KeyValueDB::Transaction fix_fm_leaked_txn;
KeyValueDB::Transaction fix_fm_false_free_txn; KeyValueDB::Transaction fix_fm_false_free_txn;

View File

@ -431,10 +431,10 @@ int main(int argc, char **argv)
r = bluestore.quick_fix(); r = bluestore.quick_fix();
} }
if (r < 0) { if (r < 0) {
cerr << "error from fsck: " << cpp_strerror(r) << std::endl; cerr << action << " failed: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else if (r > 0) { } else if (r > 0) {
cerr << action << " found " << r << " error(s)" << std::endl; cerr << action << " status: remaining " << r << " error(s) and warning(s)" << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { } else {
cout << action << " success" << std::endl; cout << action << " success" << std::endl;

View File

@ -3520,10 +3520,10 @@ int main(int argc, char **argv)
return 1; return 1;
} }
if (r > 0) { if (r > 0) {
cerr << "fsck found " << r << " errors" << std::endl; cerr << "fsck status: " << r << " remaining error(s) and warning(s)" << std::endl;
return 1; return 1;
} }
cout << "fsck found no errors" << std::endl; cout << "fsck success" << std::endl;
return 0; return 0;
} }
if (op == "repair" || op == "repair-deep") { if (op == "repair" || op == "repair-deep") {
@ -3533,10 +3533,10 @@ int main(int argc, char **argv)
return 1; return 1;
} }
if (r > 0) { if (r > 0) {
cerr << "repair found " << r << " errors" << std::endl; cerr << "repair status: " << r << " remaining error(s) and warning(s)" << std::endl;
return 1; return 1;
} }
cout << "repair found no errors" << std::endl; cout << "repair success" << std::endl;
return 0; return 0;
} }
if (op == "mkfs") { if (op == "mkfs") {