Merge PR 16202 into master

* refs/remotes/upstream/pull/16202/head:
	doc: add some docs about 'cephfs-data-scan scan_links'
	mds/FSMap.cc: remember stopped mds when resetting filesystem
	tools/cephfs: handle removed dentries when replaying journal

Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
Patrick Donnelly 2017-07-17 22:33:49 -07:00
commit ba548ffe26
No known key found for this signature in database
GPG Key ID: 3A2A7E25BEA8AADB
3 changed files with 81 additions and 11 deletions

View File

@ -130,18 +130,20 @@ objects.
Finally, you can regenerate metadata objects for missing files
and directories based on the contents of a data pool. This is
a two-phase process. First, scanning *all* objects to calculate
a three-phase process. First, scanning *all* objects to calculate
size and mtime metadata for inodes. Second, scanning the first
object from every file to collect this metadata and inject
it into the metadata pool.
object from every file to collect this metadata and inject it into
the metadata pool. Third, checking inode linkages and fixing any
errors found.
::
cephfs-data-scan scan_extents <data pool>
cephfs-data-scan scan_inodes <data pool>
cephfs-data-scan scan_links
This command may take a *very long* time if there are many
files or very large files in the data pool.
The 'scan_extents' and 'scan_inodes' commands may take a *very long* time
if there are many files or very large files in the data pool.
To accelerate the process, run multiple instances of the tool.
@ -246,7 +248,7 @@ it with empty file system data structures:
ceph osd pool create recovery <pg-num> replicated <crush-ruleset-name>
ceph fs new recovery-fs recovery <data pool> --allow-dangerous-metadata-overlay
cephfs-data-scan init --force-init --filesystem recovery-fs --alternate-pool recovery
ceph fs reset recovery-fs --yes-i-realy-mean-it
ceph fs reset recovery-fs --yes-i-really-mean-it
cephfs-table-tool recovery-fs:all reset session
cephfs-table-tool recovery-fs:all reset snap
cephfs-table-tool recovery-fs:all reset inode
@ -256,8 +258,9 @@ results to the alternate pool:
::
cephfs-data-scan scan_extents --alternate-pool recovery --filesystem <original filesystem name>
cephfs-data-scan scan_extents --alternate-pool recovery --filesystem <original filesystem name> <original data pool name>
cephfs-data-scan scan_inodes --alternate-pool recovery --filesystem <original filesystem name> --force-corrupt --force-init <original data pool name>
cephfs-data-scan scan_links --filesystem recovery-fs
If the damaged filesystem contains dirty journal data, it may be recovered next
with:
@ -267,10 +270,10 @@ with:
cephfs-journal-tool --rank=<original filesystem name>:0 event recover_dentries list --alternate-pool recovery
cephfs-journal-tool --rank recovery-fs:0 journal reset --force
After recovery, some recovered directories will have incorrect link counts.
Ensure the parameter mds_debug_scatterstat is set to false (the default) to
prevent the MDS from checking the link counts, then run a forward scrub to
repair them. Ensure you have an MDS running and issue:
After recovery, some recovered directories will have incorrect statistics.
Ensure the parameters mds_verify_scatter and mds_debug_scatterstat are set
to false (the default) to prevent the MDS from checking the statistics, then
run a forward scrub to repair them. Ensure you have an MDS running and issue:
::

View File

@ -295,6 +295,12 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid)
new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
new_fs->mds_map.enabled = true;
// Remember mds ranks that have ever started. (They should load the old
// inotable instead of creating a new one if they start again.)
new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
new_fs->mds_map.stopped.erase(mds_rank_t(0));
// Persist the new FSMap
filesystems[new_fs->fscid] = new_fs;
}

View File

@ -720,6 +720,15 @@ int JournalTool::recover_dentries(
read_keys.insert(key);
}
list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull();
for (auto& nb : nb_list) {
// Get a key like "foobar_head"
std::string key;
dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
dn_key.encode(key);
read_keys.insert(key);
}
// Perform bulk read of existing dentries
std::map<std::string, bufferlist> read_vals;
r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
@ -866,6 +875,48 @@ int JournalTool::recover_dentries(
}
}
std::set<std::string> null_vals;
for (auto& nb : nb_list) {
std::string key;
dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
dn_key.encode(key);
dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
<< dendl;
auto it = read_vals.find(key);
if (it != read_vals.end()) {
dout(4) << "dentry exists, will remove" << dendl;
bufferlist::iterator q = it->second.begin();
snapid_t dnfirst;
::decode(dnfirst, q);
char dentry_type;
::decode(dentry_type, q);
bool remove_dentry = false;
if (dentry_type == 'L') {
dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
<< "by null journal dn '" << nb.dn.c_str()
<< "' with lump fnode version " << lump.fnode.version
<< "vs existing fnode version " << old_fnode_version << dendl;
remove_dentry = old_fnode_version < lump.fnode.version;
} else if (dentry_type == 'I') {
dout(10) << "Existing full inode in slot to be (maybe) removed "
<< "by null journal dn '" << nb.dn.c_str()
<< "' with lump fnode version " << lump.fnode.version
<< "vs existing fnode version " << old_fnode_version << dendl;
remove_dentry = old_fnode_version < lump.fnode.version;
} else {
dout(4) << "corrupt dentry in backing store, will remove" << dendl;
remove_dentry = true;
}
if (remove_dentry)
null_vals.insert(key);
}
}
// Write back any new/changed dentries
if (!write_vals.empty()) {
r = output.omap_set(frag_oid.name, write_vals);
@ -875,6 +926,16 @@ int JournalTool::recover_dentries(
return r;
}
}
// remove any null dentries
if (!null_vals.empty()) {
r = output.omap_rm_keys(frag_oid.name, null_vals);
if (r != 0) {
derr << "error removing dentries from " << frag_oid.name
<< ": " << cpp_strerror(r) << dendl;
return r;
}
}
}
/* Now that we've looked at the dirlumps, we finally pay attention to