From a7126649655ef7f6b1e067d68b54d555172f9341 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Wed, 2 Mar 2011 18:49:39 -0800 Subject: [PATCH 1/2] CDir: pay attention to the max_dir_commit_size! Somehow it seems to have been ignoring this previously, which doesn't make any sense at all since otherwise our tests on it wouldn't have worked. Perhaps there was a merging error somewhere? Signed-off-by: Greg Farnum --- src/mds/CDir.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index e19f91d7889..c0005563719 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1776,7 +1776,7 @@ void CDir::_commit(version_t want) ObjectOperation m; map_t::iterator committed_dn; - unsigned max_write_size = -1; + unsigned max_write_size = cache->max_dir_commit_size; // update parent pointer while we're here. // NOTE: the pointer is ONLY required to be valid for the first frag. we put the xattr From 1ad48f12d8e252ea8c561460aafcbc6b627c8b6e Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Wed, 2 Mar 2011 18:52:51 -0800 Subject: [PATCH 2/2] CDir: Don't write out the header on a partial commit. If we write out the header as part of a partial commit, and then fail to complete a subsequent commit (network error, we crash, etc) then the on-disk version of the directory is not correctly versioned. The fact that some dentries are of a newer version than others is okay because we will fix it up during journal replay, but if the header says the directory is fully committed to the end of the journal that won't happen! So, take advantage of how messages between two daemons are strictly ordered, and how messages for a given PG are strictly ordered, and simply include the partial commit that contains the new header last. Signed-off-by: Greg Farnum --- src/mds/CDir.cc | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index c0005563719..c99dc8db874 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1607,6 +1607,11 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation& m, const set * last dentry as encoded.) * * If we're passed a last_committed_dn, skip to the next dentry after that. + * Also, don't encode the header again -- we don't want to update it + * on-disk until all the updates have made it through, so keep the header + * in only the first changeset -- our caller is responsible for making sure + * that changeset doesn't go through until after all the others do, if it's + * necessary. */ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m, const set *snaps, @@ -1617,10 +1622,12 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m, bufferlist finalbl; // header - bufferlist header; - ::encode(fnode, header); - finalbl.append(CEPH_OSD_TMAP_HDR); - ::encode(header, finalbl); + if (last_committed_dn == map_t::iterator()) { + bufferlist header; + ::encode(fnode, header); + finalbl.append(CEPH_OSD_TMAP_HDR); + ::encode(header, finalbl); + } // updated dentries map_t::iterator p = items.begin(); @@ -1805,14 +1812,24 @@ void CDir::_commit(version_t want) else { // send in a different Context C_Gather *gather = new C_Gather(new C_Dir_Committed(this, get_version(), inode->inode.last_renamed_version)); - cache->mds->objecter->mutate(oid, oloc, m, snapc, g_clock.now(), 0, NULL, - gather->new_sub()); while (committed_dn != items.end()) { - m = ObjectOperation(); - committed_dn = _commit_partial(m, snaps, max_write_size, committed_dn); - cache->mds->objecter->mutate(oid, oloc, m, snapc, g_clock.now(), 0, NULL, + ObjectOperation n = ObjectOperation(); + committed_dn = _commit_partial(n, snaps, max_write_size, committed_dn); + cache->mds->objecter->mutate(oid, oloc, n, snapc, g_clock.now(), 0, NULL, gather->new_sub()); } + /* + * save the original object for last -- it contains the new header, + * which will be committed on-disk. If we were to send it off before + * the other commits, but die before sending them all, we'd think + * that the on-disk state was fully committed even though it wasn't! + * However, since the messages are strictly ordered between the MDS and + * the OSD, and since messages to a given PG are strictly ordered, if + * we simply send the message containing the header off last, we cannot + * get our header into an incorrect state. + */ + cache->mds->objecter->mutate(oid, oloc, m, snapc, g_clock.now(), 0, NULL, + gather->new_sub()); } }